aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2008-10-15 13:46:29 +0200
committerIngo Molnar <mingo@elte.hu>2008-10-15 13:46:29 +0200
commitb2aaf8f74cdc84a9182f6cabf198b7763bcb9d40 (patch)
tree53ccb1c2c14751fe69cf93102e76e97021f6df07 /fs
parent4f962d4d65923d7b722192e729840cfb79af0a5a (diff)
parent278429cff8809958d25415ba0ed32b59866ab1a8 (diff)
Merge branch 'linus' into stackprotector
Conflicts: arch/x86/kernel/Makefile include/asm-x86/pda.h
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/9p/v9fs_vfs.h2
-rw-r--r--fs/9p/vfs_dir.c1
-rw-r--r--fs/9p/vfs_file.c4
-rw-r--r--fs/9p/vfs_inode.c21
-rw-r--r--fs/Kconfig389
-rw-r--r--fs/Kconfig.binfmt8
-rw-r--r--fs/Makefile8
-rw-r--r--fs/adfs/dir.c1
-rw-r--r--fs/adfs/super.c4
-rw-r--r--fs/affs/affs.h3
-rw-r--r--fs/affs/bitmap.c18
-rw-r--r--fs/affs/dir.c1
-rw-r--r--fs/affs/file.c4
-rw-r--r--fs/affs/super.c6
-rw-r--r--fs/afs/internal.h4
-rw-r--r--fs/afs/mntpt.c2
-rw-r--r--fs/afs/security.c2
-rw-r--r--fs/afs/super.c6
-rw-r--r--fs/afs/write.c2
-rw-r--r--fs/aio.c8
-rw-r--r--fs/anon_inodes.c11
-rw-r--r--fs/attr.c7
-rw-r--r--fs/autofs/inode.c2
-rw-r--r--fs/autofs4/autofs_i.h28
-rw-r--r--fs/autofs4/expire.c91
-rw-r--r--fs/autofs4/inode.c35
-rw-r--r--fs/autofs4/root.c591
-rw-r--r--fs/autofs4/waitq.c267
-rw-r--r--fs/bad_inode.c3
-rw-r--r--fs/befs/linuxvfs.c5
-rw-r--r--fs/bfs/bfs.h5
-rw-r--r--fs/bfs/dir.c46
-rw-r--r--fs/bfs/file.c4
-rw-r--r--fs/bfs/inode.c29
-rw-r--r--fs/binfmt_aout.c6
-rw-r--r--fs/binfmt_elf.c106
-rw-r--r--fs/binfmt_elf_fdpic.c78
-rw-r--r--fs/binfmt_flat.c7
-rw-r--r--fs/binfmt_misc.c24
-rw-r--r--fs/binfmt_som.c2
-rw-r--r--fs/bio-integrity.c745
-rw-r--r--fs/bio.c408
-rw-r--r--fs/block_dev.c199
-rw-r--r--fs/buffer.c108
-rw-r--r--fs/char_dev.c7
-rw-r--r--fs/cifs/CHANGES16
-rw-r--r--fs/cifs/README44
-rw-r--r--fs/cifs/asn1.c265
-rw-r--r--fs/cifs/cifs_debug.c696
-rw-r--r--fs/cifs/cifs_dfs_ref.c2
-rw-r--r--fs/cifs/cifs_spnego.c45
-rw-r--r--fs/cifs/cifs_spnego.h2
-rw-r--r--fs/cifs/cifsacl.c51
-rw-r--r--fs/cifs/cifsencrypt.c4
-rw-r--r--fs/cifs/cifsfs.c79
-rw-r--r--fs/cifs/cifsfs.h4
-rw-r--r--fs/cifs/cifsglob.h10
-rw-r--r--fs/cifs/cifspdu.h10
-rw-r--r--fs/cifs/cifsproto.h28
-rw-r--r--fs/cifs/cifssmb.c140
-rw-r--r--fs/cifs/connect.c223
-rw-r--r--fs/cifs/dir.c67
-rw-r--r--fs/cifs/dns_resolve.c77
-rw-r--r--fs/cifs/file.c155
-rw-r--r--fs/cifs/inode.c1036
-rw-r--r--fs/cifs/misc.c8
-rw-r--r--fs/cifs/readdir.c129
-rw-r--r--fs/cifs/sess.c17
-rw-r--r--fs/cifs/transport.c4
-rw-r--r--fs/coda/coda_linux.c6
-rw-r--r--fs/coda/dir.c4
-rw-r--r--fs/coda/inode.c2
-rw-r--r--fs/coda/pioctl.c20
-rw-r--r--fs/coda/psdev.c9
-rw-r--r--fs/coda/upcall.c15
-rw-r--r--fs/compat.c50
-rw-r--r--fs/compat_ioctl.c123
-rw-r--r--fs/configfs/configfs_internal.h7
-rw-r--r--fs/configfs/dir.c360
-rw-r--r--fs/configfs/inode.c2
-rw-r--r--fs/configfs/symlink.c42
-rw-r--r--fs/cramfs/inode.c84
-rw-r--r--fs/dcache.c516
-rw-r--r--fs/debugfs/inode.c117
-rw-r--r--fs/devpts/inode.c84
-rw-r--r--fs/direct-io.c10
-rw-r--r--fs/dlm/config.c248
-rw-r--r--fs/dlm/dlm_internal.h7
-rw-r--r--fs/dlm/lock.c4
-rw-r--r--fs/dlm/lockspace.c158
-rw-r--r--fs/dlm/lockspace.h1
-rw-r--r--fs/dlm/lowcomms.c4
-rw-r--r--fs/dlm/plock.c2
-rw-r--r--fs/dlm/user.c125
-rw-r--r--fs/dlm/user.h4
-rw-r--r--fs/dquot.c168
-rw-r--r--fs/ecryptfs/Makefile2
-rw-r--r--fs/ecryptfs/crypto.c67
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h23
-rw-r--r--fs/ecryptfs/file.c20
-rw-r--r--fs/ecryptfs/inode.c52
-rw-r--r--fs/ecryptfs/keystore.c9
-rw-r--r--fs/ecryptfs/kthread.c203
-rw-r--r--fs/ecryptfs/main.c85
-rw-r--r--fs/ecryptfs/miscdev.c61
-rw-r--r--fs/ecryptfs/mmap.c11
-rw-r--r--fs/efs/namei.c3
-rw-r--r--fs/efs/super.c4
-rw-r--r--fs/eventfd.c17
-rw-r--r--fs/eventpoll.c35
-rw-r--r--fs/exec.c239
-rw-r--r--fs/ext2/acl.c2
-rw-r--r--fs/ext2/acl.h2
-rw-r--r--fs/ext2/ext2.h2
-rw-r--r--fs/ext2/file.c1
-rw-r--r--fs/ext2/inode.c9
-rw-r--r--fs/ext2/super.c5
-rw-r--r--fs/ext2/xattr_security.c2
-rw-r--r--fs/ext2/xattr_trusted.c4
-rw-r--r--fs/ext2/xattr_user.c4
-rw-r--r--fs/ext3/acl.c2
-rw-r--r--fs/ext3/acl.h2
-rw-r--r--fs/ext3/dir.c14
-rw-r--r--fs/ext3/file.c1
-rw-r--r--fs/ext3/ialloc.c9
-rw-r--r--fs/ext3/inode.c121
-rw-r--r--fs/ext3/namei.c26
-rw-r--r--fs/ext3/super.c89
-rw-r--r--fs/ext3/xattr_security.c2
-rw-r--r--fs/ext3/xattr_trusted.c4
-rw-r--r--fs/ext3/xattr_user.c4
-rw-r--r--fs/ext4/Makefile10
-rw-r--r--fs/ext4/acl.c190
-rw-r--r--fs/ext4/acl.h12
-rw-r--r--fs/ext4/balloc.c1572
-rw-r--r--fs/ext4/bitmap.c6
-rw-r--r--fs/ext4/dir.c101
-rw-r--r--fs/ext4/ext4.h193
-rw-r--r--fs/ext4/ext4_extents.h20
-rw-r--r--fs/ext4/ext4_i.h49
-rw-r--r--fs/ext4/ext4_jbd2.h29
-rw-r--r--fs/ext4/ext4_sb.h30
-rw-r--r--fs/ext4/extents.c556
-rw-r--r--fs/ext4/file.c30
-rw-r--r--fs/ext4/fsync.c11
-rw-r--r--fs/ext4/group.h2
-rw-r--r--fs/ext4/hash.c8
-rw-r--r--fs/ext4/ialloc.c238
-rw-r--r--fs/ext4/inode.c2439
-rw-r--r--fs/ext4/ioctl.c96
-rw-r--r--fs/ext4/mballoc.c856
-rw-r--r--fs/ext4/mballoc.h11
-rw-r--r--fs/ext4/migrate.c13
-rw-r--r--fs/ext4/namei.c447
-rw-r--r--fs/ext4/resize.c147
-rw-r--r--fs/ext4/super.c757
-rw-r--r--fs/ext4/symlink.c8
-rw-r--r--fs/ext4/xattr.c18
-rw-r--r--fs/ext4/xattr.h12
-rw-r--r--fs/ext4/xattr_trusted.c4
-rw-r--r--fs/ext4/xattr_user.c4
-rw-r--r--fs/fat/cache.c4
-rw-r--r--fs/fat/dir.c233
-rw-r--r--fs/fat/fatent.c14
-rw-r--r--fs/fat/file.c27
-rw-r--r--fs/fat/inode.c72
-rw-r--r--fs/fat/misc.c10
-rw-r--r--fs/fcntl.c187
-rw-r--r--fs/fifo.c8
-rw-r--r--fs/file.c70
-rw-r--r--fs/file_table.c10
-rw-r--r--fs/fs-writeback.c22
-rw-r--r--fs/fuse/dir.c145
-rw-r--r--fs/fuse/file.c13
-rw-r--r--fs/fuse/fuse_i.h10
-rw-r--r--fs/fuse/inode.c181
-rw-r--r--fs/gfs2/Kconfig18
-rw-r--r--fs/gfs2/Makefile1
-rw-r--r--fs/gfs2/gfs2.h5
-rw-r--r--fs/gfs2/glock.c1652
-rw-r--r--fs/gfs2/glock.h12
-rw-r--r--fs/gfs2/glops.c70
-rw-r--r--fs/gfs2/incore.h76
-rw-r--r--fs/gfs2/inode.c174
-rw-r--r--fs/gfs2/inode.h6
-rw-r--r--fs/gfs2/locking.c52
-rw-r--r--fs/gfs2/locking/dlm/lock.c368
-rw-r--r--fs/gfs2/locking/dlm/lock_dlm.h18
-rw-r--r--fs/gfs2/locking/dlm/mount.c17
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c13
-rw-r--r--fs/gfs2/locking/dlm/thread.c331
-rw-r--r--fs/gfs2/locking/nolock/Makefile3
-rw-r--r--fs/gfs2/locking/nolock/main.c238
-rw-r--r--fs/gfs2/log.c23
-rw-r--r--fs/gfs2/log.h2
-rw-r--r--fs/gfs2/main.c6
-rw-r--r--fs/gfs2/meta_io.c14
-rw-r--r--fs/gfs2/meta_io.h1
-rw-r--r--fs/gfs2/mount.c9
-rw-r--r--fs/gfs2/ops_address.c54
-rw-r--r--fs/gfs2/ops_export.c2
-rw-r--r--fs/gfs2/ops_file.c58
-rw-r--r--fs/gfs2/ops_fstype.c586
-rw-r--r--fs/gfs2/ops_inode.c144
-rw-r--r--fs/gfs2/ops_super.c112
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/gfs2/recovery.c5
-rw-r--r--fs/gfs2/rgrp.c108
-rw-r--r--fs/gfs2/super.c346
-rw-r--r--fs/gfs2/super.h6
-rw-r--r--fs/gfs2/sys.c27
-rw-r--r--fs/hfs/bitmap.c8
-rw-r--r--fs/hfs/btree.c2
-rw-r--r--fs/hfs/extent.c14
-rw-r--r--fs/hfs/hfs_fs.h5
-rw-r--r--fs/hfs/inode.c11
-rw-r--r--fs/hfs/super.c6
-rw-r--r--fs/hfsplus/extents.c14
-rw-r--r--fs/hfsplus/hfsplus_fs.h3
-rw-r--r--fs/hfsplus/inode.c10
-rw-r--r--fs/hfsplus/options.c2
-rw-r--r--fs/hfsplus/super.c4
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hpfs/namei.c2
-rw-r--r--fs/hpfs/super.c4
-rw-r--r--fs/hppfs/hppfs.c7
-rw-r--r--fs/hugetlbfs/inode.c105
-rw-r--r--fs/inode.c5
-rw-r--r--fs/inotify_user.c67
-rw-r--r--fs/ioctl.c277
-rw-r--r--fs/ioprio.c8
-rw-r--r--fs/isofs/inode.c4
-rw-r--r--fs/isofs/rock.c22
-rw-r--r--fs/jbd/commit.c68
-rw-r--r--fs/jbd/journal.c8
-rw-r--r--fs/jbd/revoke.c163
-rw-r--r--fs/jbd/transaction.c61
-rw-r--r--fs/jbd2/checkpoint.c72
-rw-r--r--fs/jbd2/commit.c336
-rw-r--r--fs/jbd2/journal.c157
-rw-r--r--fs/jbd2/recovery.c7
-rw-r--r--fs/jbd2/transaction.c369
-rw-r--r--fs/jffs2/acl.c2
-rw-r--r--fs/jffs2/acl.h2
-rw-r--r--fs/jffs2/dir.c2
-rw-r--r--fs/jffs2/file.c2
-rw-r--r--fs/jffs2/ioctl.c3
-rw-r--r--fs/jffs2/jffs2_fs_i.h1
-rw-r--r--fs/jffs2/os-linux.h2
-rw-r--r--fs/jffs2/summary.c40
-rw-r--r--fs/jffs2/summary.h6
-rw-r--r--fs/jffs2/super.c2
-rw-r--r--fs/jfs/acl.c2
-rw-r--r--fs/jfs/jfs_acl.h2
-rw-r--r--fs/jfs/jfs_debug.c62
-rw-r--r--fs/jfs/jfs_debug.h10
-rw-r--r--fs/jfs/jfs_dtree.h3
-rw-r--r--fs/jfs/jfs_imap.c2
-rw-r--r--fs/jfs/jfs_logmgr.c35
-rw-r--r--fs/jfs/jfs_metapage.c38
-rw-r--r--fs/jfs/jfs_txnmgr.c68
-rw-r--r--fs/jfs/jfs_xtree.c36
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/jfs/super.c12
-rw-r--r--fs/libfs.c32
-rw-r--r--fs/lockd/Makefile2
-rw-r--r--fs/lockd/clntlock.c13
-rw-r--r--fs/lockd/clntproc.c18
-rw-r--r--fs/lockd/grace.c59
-rw-r--r--fs/lockd/host.c350
-rw-r--r--fs/lockd/mon.c2
-rw-r--r--fs/lockd/svc.c117
-rw-r--r--fs/lockd/svc4proc.c44
-rw-r--r--fs/lockd/svclock.c69
-rw-r--r--fs/lockd/svcproc.c44
-rw-r--r--fs/lockd/svcsubs.c32
-rw-r--r--fs/lockd/xdr.c2
-rw-r--r--fs/lockd/xdr4.c2
-rw-r--r--fs/locks.c98
-rw-r--r--fs/minix/inode.c5
-rw-r--r--fs/minix/minix.h6
-rw-r--r--fs/minix/namei.c24
-rw-r--r--fs/mpage.c14
-rw-r--r--fs/msdos/namei.c56
-rw-r--r--fs/namei.c395
-rw-r--r--fs/namespace.c139
-rw-r--r--fs/ncpfs/dir.c4
-rw-r--r--fs/ncpfs/file.c12
-rw-r--r--fs/ncpfs/inode.c2
-rw-r--r--fs/nfs/callback.c37
-rw-r--r--fs/nfs/client.c13
-rw-r--r--fs/nfs/dir.c101
-rw-r--r--fs/nfs/direct.c4
-rw-r--r--fs/nfs/file.c161
-rw-r--r--fs/nfs/inode.c81
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/iostat.h119
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs3acl.c9
-rw-r--r--fs/nfs/nfs3proc.c275
-rw-r--r--fs/nfs/nfs4proc.c265
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nfs/nfsroot.c12
-rw-r--r--fs/nfs/proc.c28
-rw-r--r--fs/nfs/super.c894
-rw-r--r--fs/nfs/unlink.c3
-rw-r--r--fs/nfs/write.c322
-rw-r--r--fs/nfsd/export.c6
-rw-r--r--fs/nfsd/lockd.c16
-rw-r--r--fs/nfsd/nfs2acl.c7
-rw-r--r--fs/nfsd/nfs3acl.c5
-rw-r--r--fs/nfsd/nfs3proc.c12
-rw-r--r--fs/nfsd/nfs4acl.c2
-rw-r--r--fs/nfsd/nfs4callback.c9
-rw-r--r--fs/nfsd/nfs4proc.c95
-rw-r--r--fs/nfsd/nfs4state.c83
-rw-r--r--fs/nfsd/nfs4xdr.c561
-rw-r--r--fs/nfsd/nfsctl.c124
-rw-r--r--fs/nfsd/nfsfh.c61
-rw-r--r--fs/nfsd/nfsproc.c13
-rw-r--r--fs/nfsd/nfssvc.c168
-rw-r--r--fs/nfsd/vfs.c215
-rw-r--r--fs/ntfs/aops.c2
-rw-r--r--fs/ntfs/compress.c2
-rw-r--r--fs/ntfs/file.c2
-rw-r--r--fs/ntfs/mft.c4
-rw-r--r--fs/ntfs/namei.c89
-rw-r--r--fs/ntfs/super.c2
-rw-r--r--fs/ntfs/usnjrnl.h4
-rw-r--r--fs/ocfs2/Makefile3
-rw-r--r--fs/ocfs2/alloc.c922
-rw-r--r--fs/ocfs2/alloc.h95
-rw-r--r--fs/ocfs2/aops.c98
-rw-r--r--fs/ocfs2/buffer_head_io.c134
-rw-r--r--fs/ocfs2/buffer_head_io.h23
-rw-r--r--fs/ocfs2/cluster/heartbeat.c10
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/masklog.h1
-rw-r--r--fs/ocfs2/cluster/netdebug.c28
-rw-r--r--fs/ocfs2/cluster/nodemanager.c16
-rw-r--r--fs/ocfs2/cluster/tcp.c44
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h32
-rw-r--r--fs/ocfs2/dir.c120
-rw-r--r--fs/ocfs2/dlm/dlmfs.c3
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c2
-rw-r--r--fs/ocfs2/dlmglue.c145
-rw-r--r--fs/ocfs2/extent_map.c386
-rw-r--r--fs/ocfs2/extent_map.h7
-rw-r--r--fs/ocfs2/file.c340
-rw-r--r--fs/ocfs2/file.h35
-rw-r--r--fs/ocfs2/inode.c87
-rw-r--r--fs/ocfs2/inode.h6
-rw-r--r--fs/ocfs2/ioctl.c3
-rw-r--r--fs/ocfs2/journal.c281
-rw-r--r--fs/ocfs2/journal.h55
-rw-r--r--fs/ocfs2/localalloc.c386
-rw-r--r--fs/ocfs2/localalloc.h4
-rw-r--r--fs/ocfs2/locks.c15
-rw-r--r--fs/ocfs2/locks.h1
-rw-r--r--fs/ocfs2/namei.c101
-rw-r--r--fs/ocfs2/ocfs2.h70
-rw-r--r--fs/ocfs2/ocfs2_fs.h227
-rw-r--r--fs/ocfs2/ocfs2_jbd_compat.h82
-rw-r--r--fs/ocfs2/resize.c11
-rw-r--r--fs/ocfs2/slot_map.c7
-rw-r--r--fs/ocfs2/stack_user.c55
-rw-r--r--fs/ocfs2/stackglue.c27
-rw-r--r--fs/ocfs2/stackglue.h19
-rw-r--r--fs/ocfs2/suballoc.c248
-rw-r--r--fs/ocfs2/suballoc.h26
-rw-r--r--fs/ocfs2/super.c82
-rw-r--r--fs/ocfs2/symlink.c18
-rw-r--r--fs/ocfs2/uptodate.c38
-rw-r--r--fs/ocfs2/uptodate.h3
-rw-r--r--fs/ocfs2/xattr.c4834
-rw-r--r--fs/ocfs2/xattr.h68
-rw-r--r--fs/omfs/Makefile4
-rw-r--r--fs/omfs/bitmap.c193
-rw-r--r--fs/omfs/dir.c504
-rw-r--r--fs/omfs/file.c365
-rw-r--r--fs/omfs/inode.c553
-rw-r--r--fs/omfs/omfs.h67
-rw-r--r--fs/omfs/omfs_fs.h80
-rw-r--r--fs/open.c276
-rw-r--r--fs/openpromfs/inode.c2
-rw-r--r--fs/partitions/check.c284
-rw-r--r--fs/partitions/check.h4
-rw-r--r--fs/partitions/efi.c42
-rw-r--r--fs/partitions/ldm.c70
-rw-r--r--fs/partitions/ldm.h5
-rw-r--r--fs/pipe.c96
-rw-r--r--fs/proc/Kconfig69
-rw-r--r--fs/proc/array.c75
-rw-r--r--fs/proc/base.c120
-rw-r--r--fs/proc/generic.c51
-rw-r--r--fs/proc/inode.c90
-rw-r--r--fs/proc/internal.h10
-rw-r--r--fs/proc/kcore.c10
-rw-r--r--fs/proc/kmsg.c2
-rw-r--r--fs/proc/nommu.c4
-rw-r--r--fs/proc/proc_misc.c47
-rw-r--r--fs/proc/proc_net.c43
-rw-r--r--fs/proc/proc_sysctl.c433
-rw-r--r--fs/proc/proc_tty.c48
-rw-r--r--fs/proc/task_mmu.c102
-rw-r--r--fs/proc/task_nommu.c5
-rw-r--r--fs/proc/vmcore.c6
-rw-r--r--fs/qnx4/inode.c2
-rw-r--r--fs/quota.c18
-rw-r--r--fs/quota_v1.c1
-rw-r--r--fs/quota_v2.c1
-rw-r--r--fs/ramfs/file-mmu.c1
-rw-r--r--fs/ramfs/file-nommu.c3
-rw-r--r--fs/read_write.c38
-rw-r--r--fs/readdir.c8
-rw-r--r--fs/reiserfs/inode.c4
-rw-r--r--fs/reiserfs/journal.c48
-rw-r--r--fs/reiserfs/super.c143
-rw-r--r--fs/reiserfs/xattr.c2
-rw-r--r--fs/reiserfs/xattr_security.c2
-rw-r--r--fs/reiserfs/xattr_trusted.c2
-rw-r--r--fs/reiserfs/xattr_user.c2
-rw-r--r--fs/romfs/inode.c39
-rw-r--r--fs/seq_file.c25
-rw-r--r--fs/signalfd.c19
-rw-r--r--fs/smbfs/cache.c1
-rw-r--r--fs/smbfs/file.c15
-rw-r--r--fs/smbfs/inode.c2
-rw-r--r--fs/smbfs/proc.c1
-rw-r--r--fs/splice.c67
-rw-r--r--fs/stat.c32
-rw-r--r--fs/super.c1
-rw-r--r--fs/sync.c3
-rw-r--r--fs/sysfs/dir.c34
-rw-r--r--fs/sysfs/file.c8
-rw-r--r--fs/sysfs/group.c3
-rw-r--r--fs/sysfs/symlink.c41
-rw-r--r--fs/sysfs/sysfs.h1
-rw-r--r--fs/sysv/inode.c2
-rw-r--r--fs/timerfd.c9
-rw-r--r--fs/ubifs/Kconfig72
-rw-r--r--fs/ubifs/Makefile9
-rw-r--r--fs/ubifs/budget.c812
-rw-r--r--fs/ubifs/commit.c678
-rw-r--r--fs/ubifs/compress.c253
-rw-r--r--fs/ubifs/debug.c2290
-rw-r--r--fs/ubifs/debug.h398
-rw-r--r--fs/ubifs/dir.c1231
-rw-r--r--fs/ubifs/file.c1290
-rw-r--r--fs/ubifs/find.c977
-rw-r--r--fs/ubifs/gc.c787
-rw-r--r--fs/ubifs/io.c928
-rw-r--r--fs/ubifs/ioctl.c204
-rw-r--r--fs/ubifs/journal.c1441
-rw-r--r--fs/ubifs/key.h533
-rw-r--r--fs/ubifs/log.c807
-rw-r--r--fs/ubifs/lprops.c1357
-rw-r--r--fs/ubifs/lpt.c2243
-rw-r--r--fs/ubifs/lpt_commit.c1648
-rw-r--r--fs/ubifs/master.c387
-rw-r--r--fs/ubifs/misc.h313
-rw-r--r--fs/ubifs/orphan.c958
-rw-r--r--fs/ubifs/recovery.c1519
-rw-r--r--fs/ubifs/replay.c1075
-rw-r--r--fs/ubifs/sb.c629
-rw-r--r--fs/ubifs/scan.c362
-rw-r--r--fs/ubifs/shrinker.c322
-rw-r--r--fs/ubifs/super.c1968
-rw-r--r--fs/ubifs/tnc.c2962
-rw-r--r--fs/ubifs/tnc_commit.c1102
-rw-r--r--fs/ubifs/tnc_misc.c494
-rw-r--r--fs/ubifs/ubifs-media.h745
-rw-r--r--fs/ubifs/ubifs.h1668
-rw-r--r--fs/ubifs/xattr.c571
-rw-r--r--fs/udf/file.c1
-rw-r--r--fs/udf/ialloc.c44
-rw-r--r--fs/udf/super.c61
-rw-r--r--fs/ufs/super.c7
-rw-r--r--fs/utimes.c170
-rw-r--r--fs/vfat/namei.c37
-rw-r--r--fs/xattr.c98
-rw-r--r--fs/xfs/Makefile3
-rw-r--r--fs/xfs/linux-2.6/kmem.c6
-rw-r--r--fs/xfs/linux-2.6/kmem.h6
-rw-r--r--fs/xfs/linux-2.6/sema.h52
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c18
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c33
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h14
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c12
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c6
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c392
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c534
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.h15
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h14
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.h12
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c1108
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h7
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.h4
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.c72
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h154
-rw-r--r--fs/xfs/linux-2.6/xfs_xattr.c330
-rw-r--r--fs/xfs/quota/xfs_dquot.c41
-rw-r--r--fs/xfs/quota/xfs_dquot.h31
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c12
-rw-r--r--fs/xfs/quota/xfs_qm.c38
-rw-r--r--fs/xfs/quota/xfs_qm.h2
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c7
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c16
-rw-r--r--fs/xfs/quota/xfs_quota_priv.h3
-rw-r--r--fs/xfs/support/ktrace.c4
-rw-r--r--fs/xfs/support/uuid.c8
-rw-r--r--fs/xfs/support/uuid.h1
-rw-r--r--fs/xfs/xfs_acl.c69
-rw-r--r--fs/xfs/xfs_acl.h18
-rw-r--r--fs/xfs/xfs_arch.h68
-rw-r--r--fs/xfs/xfs_attr.c718
-rw-r--r--fs/xfs/xfs_attr.h91
-rw-r--r--fs/xfs/xfs_attr_leaf.c174
-rw-r--r--fs/xfs/xfs_attr_leaf.h31
-rw-r--r--fs/xfs/xfs_attr_sf.h10
-rw-r--r--fs/xfs/xfs_bit.c103
-rw-r--r--fs/xfs/xfs_bit.h34
-rw-r--r--fs/xfs/xfs_bmap.c152
-rw-r--r--fs/xfs/xfs_bmap.h13
-rw-r--r--fs/xfs/xfs_bmap_btree.c76
-rw-r--r--fs/xfs/xfs_btree.c105
-rw-r--r--fs/xfs/xfs_btree.h8
-rw-r--r--fs/xfs/xfs_buf_item.c48
-rw-r--r--fs/xfs/xfs_clnt.h1
-rw-r--r--fs/xfs/xfs_da_btree.c48
-rw-r--r--fs/xfs/xfs_da_btree.h36
-rw-r--r--fs/xfs/xfs_dfrag.c44
-rw-r--r--fs/xfs/xfs_dir2.c125
-rw-r--r--fs/xfs/xfs_dir2.h6
-rw-r--r--fs/xfs/xfs_dir2_block.c56
-rw-r--r--fs/xfs/xfs_dir2_data.c5
-rw-r--r--fs/xfs/xfs_dir2_leaf.c93
-rw-r--r--fs/xfs/xfs_dir2_node.c402
-rw-r--r--fs/xfs/xfs_dir2_sf.c83
-rw-r--r--fs/xfs/xfs_dir2_sf.h6
-rw-r--r--fs/xfs/xfs_dir2_trace.c20
-rw-r--r--fs/xfs/xfs_dmapi.h3
-rw-r--r--fs/xfs/xfs_error.c18
-rw-r--r--fs/xfs/xfs_error.h13
-rw-r--r--fs/xfs/xfs_extfree_item.c6
-rw-r--r--fs/xfs/xfs_filestream.c6
-rw-r--r--fs/xfs/xfs_fs.h4
-rw-r--r--fs/xfs/xfs_fsops.c4
-rw-r--r--fs/xfs/xfs_ialloc_btree.c30
-rw-r--r--fs/xfs/xfs_iget.c48
-rw-r--r--fs/xfs/xfs_inode.c235
-rw-r--r--fs/xfs/xfs_inode.h49
-rw-r--r--fs/xfs/xfs_inode_item.c18
-rw-r--r--fs/xfs/xfs_iomap.c10
-rw-r--r--fs/xfs/xfs_itable.c10
-rw-r--r--fs/xfs/xfs_log.c215
-rw-r--r--fs/xfs/xfs_log.h2
-rw-r--r--fs/xfs/xfs_log_priv.h21
-rw-r--r--fs/xfs/xfs_log_recover.c28
-rw-r--r--fs/xfs/xfs_mount.c188
-rw-r--r--fs/xfs/xfs_mount.h32
-rw-r--r--fs/xfs/xfs_mru_cache.c21
-rw-r--r--fs/xfs/xfs_rename.c22
-rw-r--r--fs/xfs/xfs_rtalloc.c21
-rw-r--r--fs/xfs/xfs_rw.c2
-rw-r--r--fs/xfs/xfs_sb.h17
-rw-r--r--fs/xfs/xfs_trans.c79
-rw-r--r--fs/xfs/xfs_trans.h12
-rw-r--r--fs/xfs/xfs_trans_buf.c12
-rw-r--r--fs/xfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/xfs_trans_item.c74
-rw-r--r--fs/xfs/xfs_utils.c4
-rw-r--r--fs/xfs/xfs_utils.h3
-rw-r--r--fs/xfs/xfs_vfsops.c623
-rw-r--r--fs/xfs/xfs_vfsops.h5
-rw-r--r--fs/xfs/xfs_vnodeops.c934
-rw-r--r--fs/xfs/xfs_vnodeops.h12
582 files changed, 65598 insertions, 20376 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 047c791427a..c061c3f18e7 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -55,7 +55,7 @@ enum {
Opt_err
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_debug, "debug=%x"},
{Opt_dfltuid, "dfltuid=%u"},
{Opt_dfltgid, "dfltgid=%u"},
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index fd01d90cada..57997fa14e6 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -51,4 +51,4 @@ int v9fs_dir_release(struct inode *inode, struct file *filp);
int v9fs_file_open(struct inode *inode, struct file *file);
void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat);
void v9fs_dentry_release(struct dentry *);
-int v9fs_uflags2omode(int uflags);
+int v9fs_uflags2omode(int uflags, int extended);
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 88e3787c6ea..e298fe19409 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -119,6 +119,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
const struct file_operations v9fs_dir_operations = {
.read = generic_read_dir,
+ .llseek = generic_file_llseek,
.readdir = v9fs_dir_readdir,
.open = v9fs_file_open,
.release = v9fs_dir_release,
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 0d55affe37d..52944d2249a 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -59,7 +59,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
v9ses = v9fs_inode2v9ses(inode);
- omode = v9fs_uflags2omode(file->f_flags);
+ omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses));
fid = file->private_data;
if (!fid) {
fid = v9fs_fid_clone(file->f_path.dentry);
@@ -75,6 +75,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
inode->i_size = 0;
inode->i_blocks = 0;
}
+ if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
+ generic_file_llseek(file, 0, SEEK_END);
}
file->private_data = fid;
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 40fa807bd92..e83aa5ebe86 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -132,10 +132,10 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
/**
* v9fs_uflags2omode- convert posix open flags to plan 9 mode bits
* @uflags: flags to convert
- *
+ * @extended: if .u extensions are active
*/
-int v9fs_uflags2omode(int uflags)
+int v9fs_uflags2omode(int uflags, int extended)
{
int ret;
@@ -155,14 +155,16 @@ int v9fs_uflags2omode(int uflags)
break;
}
- if (uflags & O_EXCL)
- ret |= P9_OEXCL;
-
if (uflags & O_TRUNC)
ret |= P9_OTRUNC;
- if (uflags & O_APPEND)
- ret |= P9_OAPPEND;
+ if (extended) {
+ if (uflags & O_EXCL)
+ ret |= P9_OEXCL;
+
+ if (uflags & O_APPEND)
+ ret |= P9_OAPPEND;
+ }
return ret;
}
@@ -506,7 +508,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
flags = O_RDWR;
fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
- v9fs_uflags2omode(flags));
+ v9fs_uflags2omode(flags, v9fs_extended(v9ses)));
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
fid = NULL;
@@ -624,8 +626,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
return NULL;
error:
- if (fid)
- p9_client_clunk(fid);
+ p9_client_clunk(fid);
return ERR_PTR(result);
}
diff --git a/fs/Kconfig b/fs/Kconfig
index cf12c403b8c..9e9d70c02a0 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -136,37 +136,51 @@ config EXT3_FS_SECURITY
If you are not using a security module that requires using
extended attributes for file security labels, say N.
-config EXT4DEV_FS
- tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)"
- depends on EXPERIMENTAL
+config EXT4_FS
+ tristate "The Extended 4 (ext4) filesystem"
select JBD2
select CRC16
help
- Ext4dev is a predecessor filesystem of the next generation
- extended fs ext4, based on ext3 filesystem code. It will be
- renamed ext4 fs later, once ext4dev is mature and stabilized.
+ This is the next generation of the ext3 filesystem.
Unlike the change from ext2 filesystem to ext3 filesystem,
- the on-disk format of ext4dev is not the same as ext3 any more:
- it is based on extent maps and it supports 48-bit physical block
- numbers. These combined on-disk format changes will allow
- ext4dev/ext4 to handle more than 16 TB filesystem volumes --
- a hard limit that ext3 cannot overcome without changing the
- on-disk format.
-
- Other than extent maps and 48-bit block numbers, ext4dev also is
- likely to have other new features such as persistent preallocation,
- high resolution time stamps, and larger file support etc. These
- features will be added to ext4dev gradually.
+ the on-disk format of ext4 is not forwards compatible with
+ ext3; it is based on extent maps and it supports 48-bit
+ physical block numbers. The ext4 filesystem also supports delayed
+ allocation, persistent preallocation, high resolution time stamps,
+ and a number of other features to improve performance and speed
+ up fsck time. For more information, please see the web pages at
+ http://ext4.wiki.kernel.org.
+
+ The ext4 filesystem will support mounting an ext3
+ filesystem; while there will be some performance gains from
+ the delayed allocation and inode table readahead, the best
+ performance gains will require enabling ext4 features in the
+ filesystem, or formating a new filesystem as an ext4
+ filesystem initially.
To compile this file system support as a module, choose M here. The
module will be called ext4dev.
If unsure, say N.
-config EXT4DEV_FS_XATTR
- bool "Ext4dev extended attributes"
- depends on EXT4DEV_FS
+config EXT4DEV_COMPAT
+ bool "Enable ext4dev compatibility"
+ depends on EXT4_FS
+ help
+ Starting with 2.6.28, the name of the ext4 filesystem was
+ renamed from ext4dev to ext4. Unfortunately there are some
+ legacy userspace programs (such as klibc's fstype) have
+ "ext4dev" hardcoded.
+
+ To enable backwards compatibility so that systems that are
+ still expecting to mount ext4 filesystems using ext4dev,
+ chose Y here. This feature will go away by 2.6.31, so
+ please arrange to get your userspace programs fixed!
+
+config EXT4_FS_XATTR
+ bool "Ext4 extended attributes"
+ depends on EXT4_FS
default y
help
Extended attributes are name:value pairs associated with inodes by
@@ -175,11 +189,11 @@ config EXT4DEV_FS_XATTR
If unsure, say N.
- You need this for POSIX ACL support on ext4dev/ext4.
+ You need this for POSIX ACL support on ext4.
-config EXT4DEV_FS_POSIX_ACL
- bool "Ext4dev POSIX Access Control Lists"
- depends on EXT4DEV_FS_XATTR
+config EXT4_FS_POSIX_ACL
+ bool "Ext4 POSIX Access Control Lists"
+ depends on EXT4_FS_XATTR
select FS_POSIX_ACL
help
POSIX Access Control Lists (ACLs) support permissions for users and
@@ -190,14 +204,14 @@ config EXT4DEV_FS_POSIX_ACL
If you don't know what Access Control Lists are, say N
-config EXT4DEV_FS_SECURITY
- bool "Ext4dev Security Labels"
- depends on EXT4DEV_FS_XATTR
+config EXT4_FS_SECURITY
+ bool "Ext4 Security Labels"
+ depends on EXT4_FS_XATTR
help
Security labels support alternative access control models
implemented by security modules like SELinux. This option
enables an extended attribute handler for file security
- labels in the ext4dev/ext4 filesystem.
+ labels in the ext4 filesystem.
If you are not using a security module that requires using
extended attributes for file security labels, say N.
@@ -206,17 +220,16 @@ config JBD
tristate
help
This is a generic journalling layer for block devices. It is
- currently used by the ext3 and OCFS2 file systems, but it could
- also be used to add journal support to other file systems or block
+ currently used by the ext3 file system, but it could also be
+ used to add journal support to other file systems or block
devices such as RAID or LVM.
- If you are using the ext3 or OCFS2 file systems, you need to
- say Y here. If you are not using ext3 OCFS2 then you will probably
- want to say N.
+ If you are using the ext3 file system, you need to say Y here.
+ If you are not using ext3 then you will probably want to say N.
To compile this device as a module, choose M here: the module will be
- called jbd. If you are compiling ext3 or OCFS2 into the kernel,
- you cannot compile this code as a module.
+ called jbd. If you are compiling ext3 into the kernel, you
+ cannot compile this code as a module.
config JBD_DEBUG
bool "JBD (ext3) debugging support"
@@ -240,22 +253,23 @@ config JBD2
help
This is a generic journaling layer for block devices that support
both 32-bit and 64-bit block numbers. It is currently used by
- the ext4dev/ext4 filesystem, but it could also be used to add
+ the ext4 and OCFS2 filesystems, but it could also be used to add
journal support to other file systems or block devices such
as RAID or LVM.
- If you are using ext4dev/ext4, you need to say Y here. If you are not
- using ext4dev/ext4 then you will probably want to say N.
+ If you are using ext4 or OCFS2, you need to say Y here.
+ If you are not using ext4 or OCFS2 then you will
+ probably want to say N.
To compile this device as a module, choose M here. The module will be
- called jbd2. If you are compiling ext4dev/ext4 into the kernel,
+ called jbd2. If you are compiling ext4 or OCFS2 into the kernel,
you cannot compile this code as a module.
config JBD2_DEBUG
- bool "JBD2 (ext4dev/ext4) debugging support"
+ bool "JBD2 (ext4) debugging support"
depends on JBD2 && DEBUG_FS
help
- If you are using the ext4dev/ext4 journaled file system (or
+ If you are using the ext4 journaled file system (or
potentially any other filesystem/device using JBD2), this option
allows you to enable debugging output while the system is running,
in order to help track down any problems you are having.
@@ -270,9 +284,9 @@ config JBD2_DEBUG
config FS_MBCACHE
# Meta block cache for Extended Attributes (ext2/ext3/ext4)
tristate
- depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR
- default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y
- default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m
+ depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
+ default y if EXT2_FS=y || EXT3_FS=y || EXT4_FS=y
+ default m if EXT2_FS=m || EXT3_FS=m || EXT4_FS=m
config REISERFS_FS
tristate "Reiserfs support"
@@ -419,6 +433,14 @@ config FS_POSIX_ACL
bool
default n
+config FILE_LOCKING
+ bool "Enable POSIX file locking API" if EMBEDDED
+ default y
+ help
+ This option enables standard file locking support, required
+ for filesystems like NFS and for the flock() system
+ call. Disabling this option saves about 11k.
+
source "fs/xfs/Kconfig"
source "fs/gfs2/Kconfig"
@@ -426,7 +448,7 @@ config OCFS2_FS
tristate "OCFS2 file system support"
depends on NET && SYSFS
select CONFIGFS_FS
- select JBD
+ select JBD2
select CRC32
help
OCFS2 is a general purpose extent based shared disk cluster file
@@ -470,6 +492,14 @@ config OCFS2_FS_USERSPACE_CLUSTER
It is safe to say Y, as the clustering method is run-time
selectable.
+config OCFS2_FS_STATS
+ bool "OCFS2 statistics"
+ depends on OCFS2_FS
+ default y
+ help
+ This option allows some fs statistics to be captured. Enabling
+ this option may increase the memory consumption.
+
config OCFS2_DEBUG_MASKLOG
bool "OCFS2 logging support"
depends on OCFS2_FS
@@ -489,6 +519,16 @@ config OCFS2_DEBUG_FS
this option for debugging only as it is likely to decrease
performance of the filesystem.
+config OCFS2_COMPAT_JBD
+ bool "Use JBD for compatibility"
+ depends on OCFS2_FS
+ default n
+ select JBD
+ help
+ The ocfs2 filesystem now uses JBD2 for its journalling. JBD2
+ is backwards compatible with JBD. It is safe to say N here.
+ However, if you really want to use the original JBD, say Y here.
+
endif # BLOCK
config DNOTIFY
@@ -830,7 +870,7 @@ config NTFS_FS
from the project web site.
For more information see <file:Documentation/filesystems/ntfs.txt>
- and <http://linux-ntfs.sourceforge.net/>.
+ and <http://www.linux-ntfs.org/>.
To compile this file system support as a module, choose M here: the
module will be called ntfs.
@@ -894,65 +934,7 @@ endif # BLOCK
menu "Pseudo filesystems"
-config PROC_FS
- bool "/proc file system support" if EMBEDDED
- default y
- help
- This is a virtual file system providing information about the status
- of the system. "Virtual" means that it doesn't take up any space on
- your hard disk: the files are created on the fly by the kernel when
- you try to access them. Also, you cannot read the files with older
- version of the program less: you need to use more or cat.
-
- It's totally cool; for example, "cat /proc/interrupts" gives
- information about what the different IRQs are used for at the moment
- (there is a small number of Interrupt ReQuest lines in your computer
- that are used by the attached devices to gain the CPU's attention --
- often a source of trouble if two devices are mistakenly configured
- to use the same IRQ). The program procinfo to display some
- information about your system gathered from the /proc file system.
-
- Before you can use the /proc file system, it has to be mounted,
- meaning it has to be given a location in the directory hierarchy.
- That location should be /proc. A command such as "mount -t proc proc
- /proc" or the equivalent line in /etc/fstab does the job.
-
- The /proc file system is explained in the file
- <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage
- ("man 5 proc").
-
- This option will enlarge your kernel by about 67 KB. Several
- programs depend on this, so everyone should say Y here.
-
-config PROC_KCORE
- bool "/proc/kcore support" if !ARM
- depends on PROC_FS && MMU
-
-config PROC_VMCORE
- bool "/proc/vmcore support (EXPERIMENTAL)"
- depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP
- default y
- help
- Exports the dump image of crashed kernel in ELF format.
-
-config PROC_SYSCTL
- bool "Sysctl support (/proc/sys)" if EMBEDDED
- depends on PROC_FS
- select SYSCTL
- default y
- ---help---
- The sysctl interface provides a means of dynamically changing
- certain kernel parameters and variables on the fly without requiring
- a recompile of the kernel or reboot of the system. The primary
- interface is through /proc/sys. If you say Y here a tree of
- modifiable sysctl entries will be generated beneath the
- /proc/sys directory. They are explained in the files
- in <file:Documentation/sysctl/>. Note that enabling this
- option will enlarge the kernel by at least 8 KB.
-
- As it is generally a good thing, you should say Y here unless
- building a kernel for install/rescue disks or your system is very
- limited in memory.
+source "fs/proc/Kconfig"
config SYSFS
bool "sysfs file system support" if EMBEDDED
@@ -1375,6 +1357,9 @@ config JFFS2_CMODE_FAVOURLZO
endchoice
+# UBIFS File system configuration
+source "fs/ubifs/Kconfig"
+
config CRAMFS
tristate "Compressed ROM file system support (cramfs)"
depends on BLOCK
@@ -1430,6 +1415,19 @@ config MINIX_FS
partition (the one containing the directory /) cannot be compiled as
a module.
+config OMFS_FS
+ tristate "SonicBlue Optimized MPEG File System support"
+ depends on BLOCK
+ select CRC_ITU_T
+ help
+ This is the proprietary file system used by the Rio Karma music
+ player and ReplayTV DVR. Despite the name, this filesystem is not
+ more efficient than a standard FS for MPEG files, in fact likely
+ the opposite is true. Say Y if you have either of these devices
+ and wish to mount its disk.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called omfs. If unsure, say N.
config HPFS_FS
tristate "OS/2 HPFS file system support"
@@ -1544,10 +1542,6 @@ config UFS_FS
The recently released UFS2 variant (used in FreeBSD 5.x) is
READ-ONLY supported.
- If you only intend to mount files from some other Unix over the
- network using NFS, you don't need the UFS file system support (but
- you need NFS file system support obviously).
-
Note that this option is generally not needed for floppies, since a
good portable way to transport files and directories between unixes
(and even other operating systems) is given by the tar program ("man
@@ -1587,6 +1581,7 @@ menuconfig NETWORK_FILESYSTEMS
Say Y here to get to see options for network filesystems and
filesystem-related networking code, such as NFS daemon and
RPCSEC security modules.
+
This option alone does not add any kernel code.
If you say N, all options in this submenu will be skipped and
@@ -1595,76 +1590,92 @@ menuconfig NETWORK_FILESYSTEMS
if NETWORK_FILESYSTEMS
config NFS_FS
- tristate "NFS file system support"
+ tristate "NFS client support"
depends on INET
select LOCKD
select SUNRPC
select NFS_ACL_SUPPORT if NFS_V3_ACL
help
- If you are connected to some other (usually local) Unix computer
- (using SLIP, PLIP, PPP or Ethernet) and want to mount files residing
- on that computer (the NFS server) using the Network File Sharing
- protocol, say Y. "Mounting files" means that the client can access
- the files with usual UNIX commands as if they were sitting on the
- client's hard disk. For this to work, the server must run the
- programs nfsd and mountd (but does not need to have NFS file system
- support enabled in its kernel). NFS is explained in the Network
- Administrator's Guide, available from
- <http://www.tldp.org/docs.html#guide>, on its man page: "man
- nfs", and in the NFS-HOWTO.
-
- A superior but less widely used alternative to NFS is provided by
- the Coda file system; see "Coda file system support" below.
+ Choose Y here if you want to access files residing on other
+ computers using Sun's Network File System protocol. To compile
+ this file system support as a module, choose M here: the module
+ will be called nfs.
- If you say Y here, you should have said Y to TCP/IP networking also.
- This option would enlarge your kernel by about 27 KB.
+ To mount file systems exported by NFS servers, you also need to
+ install the user space mount.nfs command which can be found in
+ the Linux nfs-utils package, available from http://linux-nfs.org/.
+ Information about using the mount command is available in the
+ mount(8) man page. More detail about the Linux NFS client
+ implementation is available via the nfs(5) man page.
- To compile this file system support as a module, choose M here: the
- module will be called nfs.
+ Below you can choose which versions of the NFS protocol are
+ available in the kernel to mount NFS servers. Support for NFS
+ version 2 (RFC 1094) is always available when NFS_FS is selected.
- If you are configuring a diskless machine which will mount its root
- file system over NFS at boot time, say Y here and to "Kernel
- level IP autoconfiguration" above and to "Root file system on NFS"
- below. You cannot compile this driver as a module in this case.
- There are two packages designed for booting diskless machines over
- the net: netboot, available from
- <http://ftp1.sourceforge.net/netboot/>, and Etherboot,
- available from <http://ftp1.sourceforge.net/etherboot/>.
+ To configure a system which mounts its root file system via NFS
+ at boot time, say Y here, select "Kernel level IP
+ autoconfiguration" in the NETWORK menu, and select "Root file
+ system on NFS" below. You cannot compile this file system as a
+ module in this case.
- If you don't know what all this is about, say N.
+ If unsure, say N.
config NFS_V3
- bool "Provide NFSv3 client support"
+ bool "NFS client support for NFS version 3"
depends on NFS_FS
help
- Say Y here if you want your NFS client to be able to speak version
- 3 of the NFS protocol.
+ This option enables support for version 3 of the NFS protocol
+ (RFC 1813) in the kernel's NFS client.
If unsure, say Y.
config NFS_V3_ACL
- bool "Provide client support for the NFSv3 ACL protocol extension"
+ bool "NFS client support for the NFSv3 ACL protocol extension"
depends on NFS_V3
help
- Implement the NFSv3 ACL protocol extension for manipulating POSIX
- Access Control Lists. The server should also be compiled with
- the NFSv3 ACL protocol extension; see the CONFIG_NFSD_V3_ACL option.
+ Some NFS servers support an auxiliary NFSv3 ACL protocol that
+ Sun added to Solaris but never became an official part of the
+ NFS version 3 protocol. This protocol extension allows
+ applications on NFS clients to manipulate POSIX Access Control
+ Lists on files residing on NFS servers. NFS servers enforce
+ ACLs on local files whether this protocol is available or not.
+
+ Choose Y here if your NFS server supports the Solaris NFSv3 ACL
+ protocol extension and you want your NFS client to allow
+ applications to access and modify ACLs on files on the server.
+
+ Most NFS servers don't support the Solaris NFSv3 ACL protocol
+ extension. You can choose N here or specify the "noacl" mount
+ option to prevent your NFS client from trying to use the NFSv3
+ ACL protocol.
If unsure, say N.
config NFS_V4
- bool "Provide NFSv4 client support (EXPERIMENTAL)"
+ bool "NFS client support for NFS version 4 (EXPERIMENTAL)"
depends on NFS_FS && EXPERIMENTAL
select RPCSEC_GSS_KRB5
help
- Say Y here if you want your NFS client to be able to speak the newer
- version 4 of the NFS protocol.
+ This option enables support for version 4 of the NFS protocol
+ (RFC 3530) in the kernel's NFS client.
- Note: Requires auxiliary userspace daemons which may be found on
- http://www.citi.umich.edu/projects/nfsv4/
+ To mount NFS servers using NFSv4, you also need to install user
+ space programs which can be found in the Linux nfs-utils package,
+ available from http://linux-nfs.org/.
If unsure, say N.
+config ROOT_NFS
+ bool "Root file system on NFS"
+ depends on NFS_FS=y && IP_PNP
+ help
+ If you want your system to mount its root file system via NFS,
+ choose Y here. This is common practice for managing systems
+ without local permanent storage. For details, read
+ <file:Documentation/filesystems/nfsroot.txt>.
+
+ Most people say N here.
+
config NFSD
tristate "NFS server support"
depends on INET
@@ -1746,20 +1757,6 @@ config NFSD_V4
If unsure, say N.
-config ROOT_NFS
- bool "Root file system on NFS"
- depends on NFS_FS=y && IP_PNP
- help
- If you want your Linux box to mount its whole root file system (the
- one containing the directory /) from some other computer over the
- net via NFS (presumably because your box doesn't have a hard disk),
- say Y. Read <file:Documentation/filesystems/nfsroot.txt> for
- details. It is likely that in this case, you also want to say Y to
- "Kernel level IP autoconfiguration" so that your box can discover
- its network address at boot time.
-
- Most people say N here.
-
config LOCKD
tristate
@@ -1800,26 +1797,27 @@ config SUNRPC_XPRT_RDMA
If unsure, say N.
-config SUNRPC_BIND34
- bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)"
+config SUNRPC_REGISTER_V4
+ bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
depends on SUNRPC && EXPERIMENTAL
default n
help
- RPC requests over IPv6 networks require support for larger
- addresses when performing an RPC bind. Sun added support for
- IPv6 addressing by creating two new versions of the rpcbind
- protocol (RFC 1833).
+ Sun added support for registering RPC services at an IPv6
+ address by creating two new versions of the rpcbind protocol
+ (RFC 1833).
+
+ This option enables support in the kernel RPC server for
+ registering kernel RPC services via version 4 of the rpcbind
+ protocol. If you enable this option, you must run a portmapper
+ daemon that supports rpcbind protocol version 4.
- This option enables support in the kernel RPC client for
- querying rpcbind servers via versions 3 and 4 of the rpcbind
- protocol. The kernel automatically falls back to version 2
- if a remote rpcbind service does not support versions 3 or 4.
- By themselves, these new versions do not provide support for
- RPC over IPv6, but the new protocol versions are necessary to
- support it.
+ Serving NFS over IPv6 from knfsd (the kernel's NFS server)
+ requires that you enable this option and use a portmapper that
+ supports rpcbind version 4.
- If unsure, say N to get traditional behavior (version 2 rpcbind
- requests only).
+ If unsure, say N to get traditional behavior (register kernel
+ RPC services using only rpcbind version 2). Distributions
+ using the legacy Linux portmapper daemon must say N here.
config RPCSEC_GSS_KRB5
tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
@@ -1986,6 +1984,16 @@ config CIFS_WEAK_PW_HASH
If unsure, say N.
+config CIFS_UPCALL
+ bool "Kerberos/SPNEGO advanced session setup"
+ depends on CIFS && KEYS
+ help
+ Enables an upcall mechanism for CIFS which accesses
+ userspace helper utilities to provide SPNEGO packaged (RFC 4178)
+ Kerberos tickets which are needed to mount to certain secure servers
+ (for which more secure Kerberos authentication is required). If
+ unsure, say N.
+
config CIFS_XATTR
bool "CIFS extended attributes"
depends on CIFS
@@ -2038,17 +2046,6 @@ config CIFS_EXPERIMENTAL
(which is disabled by default). See the file fs/cifs/README
for more details. If unsure, say N.
-config CIFS_UPCALL
- bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)"
- depends on CIFS_EXPERIMENTAL
- depends on KEYS
- help
- Enables an upcall mechanism for CIFS which accesses
- userspace helper utilities to provide SPNEGO packaged (RFC 4178)
- Kerberos tickets which are needed to mount to certain secure servers
- (for which more secure Kerberos authentication is required). If
- unsure, say N.
-
config CIFS_DFS_UPCALL
bool "DFS feature support (EXPERIMENTAL)"
depends on CIFS_EXPERIMENTAL
@@ -2104,20 +2101,6 @@ config CODA_FS
To compile the coda client support as a module, choose M here: the
module will be called coda.
-config CODA_FS_OLD_API
- bool "Use 96-bit Coda file identifiers"
- depends on CODA_FS
- help
- A new kernel-userspace API had to be introduced for Coda v6.0
- to support larger 128-bit file identifiers as needed by the
- new realms implementation.
-
- However this new API is not backward compatible with older
- clients. If you really need to run the old Coda userspace
- cache manager then say Y.
-
- For most cases you probably want to say N.
-
config AFS_FS
tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
depends on INET && EXPERIMENTAL
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 3263084eef9..17c9c5ec14c 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -30,7 +30,7 @@ config COMPAT_BINFMT_ELF
config BINFMT_ELF_FDPIC
bool "Kernel support for FDPIC ELF binaries"
default y
- depends on (FRV || BLACKFIN)
+ depends on (FRV || BLACKFIN || (SUPERH32 && !MMU))
help
ELF FDPIC binaries are based on ELF, but allow the individual load
segments of a binary to be located in memory independently of each
@@ -59,10 +59,12 @@ config BINFMT_SHARED_FLAT
help
Support FLAT shared libraries
+config HAVE_AOUT
+ def_bool n
+
config BINFMT_AOUT
tristate "Kernel support for a.out and ECOFF binaries"
- depends on ARCH_SUPPORTS_AOUT && \
- (X86_32 || ALPHA || ARM || M68K)
+ depends on HAVE_AOUT
---help---
A.out (Assembler.OUTput) is a set of formats for libraries and
executables used in the earliest versions of UNIX. Linux used
diff --git a/fs/Makefile b/fs/Makefile
index 1e7a11bd4da..b6f27dc26b7 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -7,7 +7,7 @@
obj-y := open.o read_write.o file_table.o super.o \
char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
- ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
+ ioctl.o readdir.o select.o fifo.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o drop_caches.o splice.o sync.o utimes.o \
@@ -19,6 +19,7 @@ else
obj-y += no-block.o
endif
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
obj-$(CONFIG_EPOLL) += eventpoll.o
@@ -26,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
+obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
nfsd-$(CONFIG_NFSD) := nfsctl.o
@@ -68,7 +70,7 @@ obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_REISERFS_FS) += reiserfs/
obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev
+obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev
obj-$(CONFIG_JBD) += jbd/
obj-$(CONFIG_JBD2) += jbd2/
obj-$(CONFIG_EXT2_FS) += ext2/
@@ -100,6 +102,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/
obj-$(CONFIG_UFS_FS) += ufs/
obj-$(CONFIG_EFS_FS) += efs/
obj-$(CONFIG_JFFS2_FS) += jffs2/
+obj-$(CONFIG_UBIFS_FS) += ubifs/
obj-$(CONFIG_AFFS_FS) += affs/
obj-$(CONFIG_ROMFS_FS) += romfs/
obj-$(CONFIG_QNX4FS_FS) += qnx4/
@@ -109,6 +112,7 @@ obj-$(CONFIG_ADFS_FS) += adfs/
obj-$(CONFIG_FUSE_FS) += fuse/
obj-$(CONFIG_UDF_FS) += udf/
obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/
+obj-$(CONFIG_OMFS_FS) += omfs/
obj-$(CONFIG_JFS_FS) += jfs/
obj-$(CONFIG_XFS_FS) += xfs/
obj-$(CONFIG_9P_FS) += 9p/
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index fc1a8dc64d7..85a30e92980 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,6 +197,7 @@ out:
const struct file_operations adfs_dir_operations = {
.read = generic_read_dir,
+ .llseek = generic_file_llseek,
.readdir = adfs_readdir,
.fsync = file_fsync,
};
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 9e421eeb672..7f83a46f2b7 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -157,7 +157,7 @@ static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
{Opt_ownmask, "ownmask=%o"},
@@ -249,7 +249,7 @@ static void adfs_destroy_inode(struct inode *inode)
kmem_cache_free(adfs_inode_cachep, ADFS_I(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct adfs_inode_info *ei = (struct adfs_inode_info *) foo;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 223b1917093..e9ec915f755 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -2,6 +2,7 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/amigaffs.h>
+#include <linux/mutex.h>
/* AmigaOS allows file names with up to 30 characters length.
* Names longer than that will be silently truncated. If you
@@ -98,7 +99,7 @@ struct affs_sb_info {
gid_t s_gid; /* gid to override */
umode_t s_mode; /* mode to override */
struct buffer_head *s_root_bh; /* Cached root block. */
- struct semaphore s_bmlock; /* Protects bitmap access. */
+ struct mutex s_bmlock; /* Protects bitmap access. */
struct affs_bm_info *s_bitmap; /* Bitmap infos. */
u32 s_bmap_count; /* # of bitmap blocks. */
u32 s_bmap_bits; /* # of bits in one bitmap blocks */
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index c4a5ad09ddf..dc5ef14bdc1 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -45,14 +45,14 @@ affs_count_free_blocks(struct super_block *sb)
if (sb->s_flags & MS_RDONLY)
return 0;
- down(&AFFS_SB(sb)->s_bmlock);
+ mutex_lock(&AFFS_SB(sb)->s_bmlock);
bm = AFFS_SB(sb)->s_bitmap;
free = 0;
for (i = AFFS_SB(sb)->s_bmap_count; i > 0; bm++, i--)
free += bm->bm_free;
- up(&AFFS_SB(sb)->s_bmlock);
+ mutex_unlock(&AFFS_SB(sb)->s_bmlock);
return free;
}
@@ -76,7 +76,7 @@ affs_free_block(struct super_block *sb, u32 block)
bit = blk % sbi->s_bmap_bits;
bm = &sbi->s_bitmap[bmap];
- down(&sbi->s_bmlock);
+ mutex_lock(&sbi->s_bmlock);
bh = sbi->s_bmap_bh;
if (sbi->s_last_bmap != bmap) {
@@ -105,19 +105,19 @@ affs_free_block(struct super_block *sb, u32 block)
sb->s_dirt = 1;
bm->bm_free++;
- up(&sbi->s_bmlock);
+ mutex_unlock(&sbi->s_bmlock);
return;
err_free:
affs_warning(sb,"affs_free_block","Trying to free block %u which is already free", block);
- up(&sbi->s_bmlock);
+ mutex_unlock(&sbi->s_bmlock);
return;
err_bh_read:
affs_error(sb,"affs_free_block","Cannot read bitmap block %u", bm->bm_key);
sbi->s_bmap_bh = NULL;
sbi->s_last_bmap = ~0;
- up(&sbi->s_bmlock);
+ mutex_unlock(&sbi->s_bmlock);
return;
err_range:
@@ -168,7 +168,7 @@ affs_alloc_block(struct inode *inode, u32 goal)
bmap = blk / sbi->s_bmap_bits;
bm = &sbi->s_bitmap[bmap];
- down(&sbi->s_bmlock);
+ mutex_lock(&sbi->s_bmlock);
if (bm->bm_free)
goto find_bmap_bit;
@@ -249,7 +249,7 @@ find_bit:
mark_buffer_dirty(bh);
sb->s_dirt = 1;
- up(&sbi->s_bmlock);
+ mutex_unlock(&sbi->s_bmlock);
pr_debug("%d\n", blk);
return blk;
@@ -259,7 +259,7 @@ err_bh_read:
sbi->s_bmap_bh = NULL;
sbi->s_last_bmap = ~0;
err_full:
- up(&sbi->s_bmlock);
+ mutex_unlock(&sbi->s_bmlock);
pr_debug("failed\n");
return 0;
}
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 6e3f282424b..7b36904dbea 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -19,6 +19,7 @@ static int affs_readdir(struct file *, void *, filldir_t);
const struct file_operations affs_dir_operations = {
.read = generic_read_dir,
+ .llseek = generic_file_llseek,
.readdir = affs_readdir,
.fsync = file_fsync,
};
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 6eac7bdeec9..1377b1240b6 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -46,8 +46,6 @@ const struct inode_operations affs_file_inode_operations = {
static int
affs_file_open(struct inode *inode, struct file *filp)
{
- if (atomic_read(&filp->f_count) != 1)
- return 0;
pr_debug("AFFS: open(%lu,%d)\n",
inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
atomic_inc(&AFFS_I(inode)->i_opencnt);
@@ -57,8 +55,6 @@ affs_file_open(struct inode *inode, struct file *filp)
static int
affs_file_release(struct inode *inode, struct file *filp)
{
- if (atomic_read(&filp->f_count) != 0)
- return 0;
pr_debug("AFFS: release(%lu, %d)\n",
inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
diff --git a/fs/affs/super.c b/fs/affs/super.c
index d214837d5e4..8989c93193e 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -90,7 +90,7 @@ static void affs_destroy_inode(struct inode *inode)
kmem_cache_free(affs_inode_cachep, AFFS_I(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct affs_inode_info *ei = (struct affs_inode_info *) foo;
@@ -135,7 +135,7 @@ enum {
Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_bs, "bs=%u"},
{Opt_mode, "mode=%o"},
{Opt_mufs, "mufs"},
@@ -290,7 +290,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
if (!sbi)
return -ENOMEM;
sb->s_fs_info = sbi;
- init_MUTEX(&sbi->s_bmlock);
+ mutex_init(&sbi->s_bmlock);
if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
&blocksize,&sbi->s_prefix,
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 7102824ba84..3cb6920ff30 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -469,8 +469,6 @@ extern bool afs_cm_incoming_call(struct afs_call *);
extern const struct inode_operations afs_dir_inode_operations;
extern const struct file_operations afs_dir_file_operations;
-extern int afs_permission(struct inode *, int, struct nameidata *);
-
/*
* file.c
*/
@@ -605,7 +603,7 @@ extern void afs_clear_permits(struct afs_vnode *);
extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
extern void afs_zap_permits(struct rcu_head *);
extern struct key *afs_request_key(struct afs_cell *);
-extern int afs_permission(struct inode *, int, struct nameidata *);
+extern int afs_permission(struct inode *, int);
/*
* server.c
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 2f5503902c3..78db4953a80 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -232,7 +232,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
}
mntget(newmnt);
- err = do_add_mount(newmnt, nd, MNT_SHRINKABLE, &afs_vfsmounts);
+ err = do_add_mount(newmnt, &nd->path, MNT_SHRINKABLE, &afs_vfsmounts);
switch (err) {
case 0:
path_put(&nd->path);
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3bcbeceba1b..3ef50437003 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -284,7 +284,7 @@ static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
* - AFS ACLs are attached to directories only, and a file is controlled by its
* parent directory's ACL
*/
-int afs_permission(struct inode *inode, int mask, struct nameidata *nd)
+int afs_permission(struct inode *inode, int mask)
{
struct afs_vnode *vnode = AFS_FS_I(inode);
afs_access_t uninitialized_var(access);
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 7e3faeef681..aee239a048c 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -27,7 +27,7 @@
#define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
-static void afs_i_init_once(struct kmem_cache *cachep, void *foo);
+static void afs_i_init_once(void *foo);
static int afs_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name,
void *data, struct vfsmount *mnt);
@@ -64,7 +64,7 @@ enum {
afs_opt_vol,
};
-static match_table_t afs_options_list = {
+static const match_table_t afs_options_list = {
{ afs_opt_cell, "cell=%s" },
{ afs_opt_rwpath, "rwpath" },
{ afs_opt_vol, "vol=%s" },
@@ -449,7 +449,7 @@ static void afs_put_super(struct super_block *sb)
/*
* initialise an inode cache slab element prior to any use
*/
-static void afs_i_init_once(struct kmem_cache *cachep, void *_vnode)
+static void afs_i_init_once(void *_vnode)
{
struct afs_vnode *vnode = _vnode;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 9a849ad3c48..065b4e10681 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -404,7 +404,7 @@ static int afs_write_back_from_locked_page(struct afs_writeback *wb,
page = pages[loop];
if (page->index > wb->last)
break;
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
break;
if (!PageDirty(page) ||
page_private(page) != (unsigned long) wb) {
diff --git a/fs/aio.c b/fs/aio.c
index 0fb3117ddd9..f658441d566 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -512,8 +512,8 @@ static void aio_fput_routine(struct work_struct *data)
*/
static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
{
- dprintk(KERN_DEBUG "aio_put(%p): f_count=%d\n",
- req, atomic_read(&req->ki_filp->f_count));
+ dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
+ req, atomic_long_read(&req->ki_filp->f_count));
assert_spin_locked(&ctx->ctx_lock);
@@ -528,7 +528,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
/* Must be done under the lock to serialise against cancellation.
* Call this aio_fput as it duplicates fput via the fput_work.
*/
- if (unlikely(atomic_dec_and_test(&req->ki_filp->f_count))) {
+ if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
get_ioctx(ctx);
spin_lock(&fput_lock);
list_add(&req->ki_list, &fput_head);
@@ -586,7 +586,6 @@ static void use_mm(struct mm_struct *mm)
struct task_struct *tsk = current;
task_lock(tsk);
- tsk->flags |= PF_BORROWED_MM;
active_mm = tsk->active_mm;
atomic_inc(&mm->mm_count);
tsk->mm = mm;
@@ -610,7 +609,6 @@ static void unuse_mm(struct mm_struct *mm)
struct task_struct *tsk = current;
task_lock(tsk);
- tsk->flags &= ~PF_BORROWED_MM;
tsk->mm = NULL;
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 977ef208c05..3662dd44896 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -58,8 +58,9 @@ static struct dentry_operations anon_inodefs_dentry_operations = {
* of the file
*
* @name: [in] name of the "class" of the new file
- * @fops [in] file operations for the new file
- * @priv [in] private data for the new file (will be file's private_data)
+ * @fops: [in] file operations for the new file
+ * @priv: [in] private data for the new file (will be file's private_data)
+ * @flags: [in] flags
*
* Creates a new file by hooking it on a single inode. This is useful for files
* that do not need to have a full-fledged inode in order to operate correctly.
@@ -68,7 +69,7 @@ static struct dentry_operations anon_inodefs_dentry_operations = {
* setup. Returns new descriptor or -error.
*/
int anon_inode_getfd(const char *name, const struct file_operations *fops,
- void *priv)
+ void *priv, int flags)
{
struct qstr this;
struct dentry *dentry;
@@ -78,7 +79,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
if (IS_ERR(anon_inode_inode))
return -ENODEV;
- error = get_unused_fd();
+ error = get_unused_fd_flags(flags);
if (error < 0)
return error;
fd = error;
@@ -115,7 +116,7 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
file->f_mapping = anon_inode_inode->i_mapping;
file->f_pos = 0;
- file->f_flags = O_RDWR;
+ file->f_flags = O_RDWR | (flags & O_NONBLOCK);
file->f_version = 0;
file->private_data = priv;
diff --git a/fs/attr.c b/fs/attr.c
index 966b73e25f8..26c71ba1eed 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -51,7 +51,7 @@ int inode_change_ok(struct inode *inode, struct iattr *attr)
}
/* Check for setting the inode time. */
- if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) {
+ if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
if (!is_owner_or_cap(inode))
goto error;
}
@@ -108,6 +108,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
struct timespec now;
unsigned int ia_valid = attr->ia_valid;
+ if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ return -EPERM;
+ }
+
now = current_fs_time(inode->i_sb);
attr->ia_ctime = now;
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index dda510d31f8..b70eea1e8c5 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -59,7 +59,7 @@ static const struct super_operations autofs_sops = {
enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto};
-static match_table_t autofs_tokens = {
+static const match_table_t autofs_tokens = {
{Opt_fd, "fd=%u"},
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c3d352d7fa9..69a2f5c9231 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -52,7 +52,10 @@ struct autofs_info {
int flags;
- struct list_head rehash;
+ struct completion expire_complete;
+
+ struct list_head active;
+ struct list_head expiring;
struct autofs_sb_info *sbi;
unsigned long last_used;
@@ -68,15 +71,14 @@ struct autofs_info {
};
#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
+#define AUTOFS_INF_MOUNTPOINT (1<<1) /* mountpoint status for direct expire */
struct autofs_wait_queue {
wait_queue_head_t queue;
struct autofs_wait_queue *next;
autofs_wqt_t wait_queue_token;
/* We use the following to see what we are waiting for */
- unsigned int hash;
- unsigned int len;
- char *name;
+ struct qstr name;
u32 dev;
u64 ino;
uid_t uid;
@@ -85,7 +87,7 @@ struct autofs_wait_queue {
pid_t tgid;
/* This is for status reporting upon return */
int status;
- atomic_t wait_ctr;
+ unsigned int wait_ctr;
};
#define AUTOFS_SBI_MAGIC 0x6d4a556d
@@ -112,8 +114,9 @@ struct autofs_sb_info {
struct mutex wq_mutex;
spinlock_t fs_lock;
struct autofs_wait_queue *queues; /* Wait queue pointer */
- spinlock_t rehash_lock;
- struct list_head rehash_list;
+ spinlock_t lookup_lock;
+ struct list_head active_list;
+ struct list_head expiring_list;
};
static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
@@ -138,18 +141,14 @@ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
static inline int autofs4_ispending(struct dentry *dentry)
{
struct autofs_info *inf = autofs4_dentry_ino(dentry);
- int pending = 0;
if (dentry->d_flags & DCACHE_AUTOFS_PENDING)
return 1;
- if (inf) {
- spin_lock(&inf->sbi->fs_lock);
- pending = inf->flags & AUTOFS_INF_EXPIRING;
- spin_unlock(&inf->sbi->fs_lock);
- }
+ if (inf->flags & AUTOFS_INF_EXPIRING)
+ return 1;
- return pending;
+ return 0;
}
static inline void autofs4_copy_atime(struct file *src, struct file *dst)
@@ -164,6 +163,7 @@ void autofs4_free_ino(struct autofs_info *);
/* Expiration */
int is_autofs4_dentry(struct dentry *);
+int autofs4_expire_wait(struct dentry *dentry);
int autofs4_expire_run(struct super_block *, struct vfsmount *,
struct autofs_sb_info *,
struct autofs_packet_expire __user *);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 894fee54d4d..cdabb796ff0 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -259,13 +259,15 @@ static struct dentry *autofs4_expire_direct(struct super_block *sb,
now = jiffies;
timeout = sbi->exp_timeout;
- /* Lock the tree as we must expire as a whole */
spin_lock(&sbi->fs_lock);
if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
struct autofs_info *ino = autofs4_dentry_ino(root);
-
- /* Set this flag early to catch sys_chdir and the like */
+ if (d_mountpoint(root)) {
+ ino->flags |= AUTOFS_INF_MOUNTPOINT;
+ root->d_mounted--;
+ }
ino->flags |= AUTOFS_INF_EXPIRING;
+ init_completion(&ino->expire_complete);
spin_unlock(&sbi->fs_lock);
return root;
}
@@ -292,6 +294,8 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
struct list_head *next;
int do_now = how & AUTOFS_EXP_IMMEDIATE;
int exp_leaves = how & AUTOFS_EXP_LEAVES;
+ struct autofs_info *ino;
+ unsigned int ino_count;
if (!root)
return NULL;
@@ -316,6 +320,9 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
dentry = dget(dentry);
spin_unlock(&dcache_lock);
+ spin_lock(&sbi->fs_lock);
+ ino = autofs4_dentry_ino(dentry);
+
/*
* Case 1: (i) indirect mount or top level pseudo direct mount
* (autofs-4.1).
@@ -326,6 +333,11 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
DPRINTK("checking mountpoint %p %.*s",
dentry, (int)dentry->d_name.len, dentry->d_name.name);
+ /* Path walk currently on this dentry? */
+ ino_count = atomic_read(&ino->count) + 2;
+ if (atomic_read(&dentry->d_count) > ino_count)
+ goto next;
+
/* Can we umount this guy */
if (autofs4_mount_busy(mnt, dentry))
goto next;
@@ -343,23 +355,25 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
/* Case 2: tree mount, expire iff entire tree is not busy */
if (!exp_leaves) {
- /* Lock the tree as we must expire as a whole */
- spin_lock(&sbi->fs_lock);
- if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
- struct autofs_info *inf = autofs4_dentry_ino(dentry);
+ /* Path walk currently on this dentry? */
+ ino_count = atomic_read(&ino->count) + 1;
+ if (atomic_read(&dentry->d_count) > ino_count)
+ goto next;
- /* Set this flag early to catch sys_chdir and the like */
- inf->flags |= AUTOFS_INF_EXPIRING;
- spin_unlock(&sbi->fs_lock);
+ if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
expired = dentry;
goto found;
}
- spin_unlock(&sbi->fs_lock);
/*
* Case 3: pseudo direct mount, expire individual leaves
* (autofs-4.1).
*/
} else {
+ /* Path walk currently on this dentry? */
+ ino_count = atomic_read(&ino->count) + 1;
+ if (atomic_read(&dentry->d_count) > ino_count)
+ goto next;
+
expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
if (expired) {
dput(dentry);
@@ -367,6 +381,7 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb,
}
}
next:
+ spin_unlock(&sbi->fs_lock);
dput(dentry);
spin_lock(&dcache_lock);
next = next->next;
@@ -377,12 +392,45 @@ next:
found:
DPRINTK("returning %p %.*s",
expired, (int)expired->d_name.len, expired->d_name.name);
+ ino = autofs4_dentry_ino(expired);
+ ino->flags |= AUTOFS_INF_EXPIRING;
+ init_completion(&ino->expire_complete);
+ spin_unlock(&sbi->fs_lock);
spin_lock(&dcache_lock);
list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
spin_unlock(&dcache_lock);
return expired;
}
+int autofs4_expire_wait(struct dentry *dentry)
+{
+ struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ int status;
+
+ /* Block on any pending expire */
+ spin_lock(&sbi->fs_lock);
+ if (ino->flags & AUTOFS_INF_EXPIRING) {
+ spin_unlock(&sbi->fs_lock);
+
+ DPRINTK("waiting for expire %p name=%.*s",
+ dentry, dentry->d_name.len, dentry->d_name.name);
+
+ status = autofs4_wait(sbi, dentry, NFY_NONE);
+ wait_for_completion(&ino->expire_complete);
+
+ DPRINTK("expire done status=%d", status);
+
+ if (d_unhashed(dentry))
+ return -EAGAIN;
+
+ return status;
+ }
+ spin_unlock(&sbi->fs_lock);
+
+ return 0;
+}
+
/* Perform an expiry operation */
int autofs4_expire_run(struct super_block *sb,
struct vfsmount *mnt,
@@ -390,7 +438,9 @@ int autofs4_expire_run(struct super_block *sb,
struct autofs_packet_expire __user *pkt_p)
{
struct autofs_packet_expire pkt;
+ struct autofs_info *ino;
struct dentry *dentry;
+ int ret = 0;
memset(&pkt,0,sizeof pkt);
@@ -406,9 +456,15 @@ int autofs4_expire_run(struct super_block *sb,
dput(dentry);
if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) )
- return -EFAULT;
+ ret = -EFAULT;
- return 0;
+ spin_lock(&sbi->fs_lock);
+ ino = autofs4_dentry_ino(dentry);
+ ino->flags &= ~AUTOFS_INF_EXPIRING;
+ complete_all(&ino->expire_complete);
+ spin_unlock(&sbi->fs_lock);
+
+ return ret;
}
/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
@@ -433,9 +489,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
/* This is synchronous because it makes the daemon a
little easier */
- ino->flags |= AUTOFS_INF_EXPIRING;
ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
+
+ spin_lock(&sbi->fs_lock);
+ if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
+ sb->s_root->d_mounted++;
+ ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
+ }
ino->flags &= ~AUTOFS_INF_EXPIRING;
+ complete_all(&ino->expire_complete);
+ spin_unlock(&sbi->fs_lock);
dput(dentry);
}
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 2fdcf5e1d23..45d55819203 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -24,8 +24,10 @@
static void ino_lnkfree(struct autofs_info *ino)
{
- kfree(ino->u.symlink);
- ino->u.symlink = NULL;
+ if (ino->u.symlink) {
+ kfree(ino->u.symlink);
+ ino->u.symlink = NULL;
+ }
}
struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
@@ -41,16 +43,18 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
if (ino == NULL)
return NULL;
- ino->flags = 0;
- ino->mode = mode;
- ino->inode = NULL;
- ino->dentry = NULL;
- ino->size = 0;
-
- INIT_LIST_HEAD(&ino->rehash);
+ if (!reinit) {
+ ino->flags = 0;
+ ino->inode = NULL;
+ ino->dentry = NULL;
+ ino->size = 0;
+ INIT_LIST_HEAD(&ino->active);
+ INIT_LIST_HEAD(&ino->expiring);
+ atomic_set(&ino->count, 0);
+ }
+ ino->mode = mode;
ino->last_used = jiffies;
- atomic_set(&ino->count, 0);
ino->sbi = sbi;
@@ -159,8 +163,8 @@ void autofs4_kill_sb(struct super_block *sb)
if (!sbi)
goto out_kill_sb;
- if (!sbi->catatonic)
- autofs4_catatonic_mode(sbi); /* Free wait queues, close pipe */
+ /* Free wait queues, close pipe */
+ autofs4_catatonic_mode(sbi);
/* Clean up and release dangling references */
autofs4_force_release(sbi);
@@ -209,7 +213,7 @@ static const struct super_operations autofs4_sops = {
enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto,
Opt_indirect, Opt_direct, Opt_offset};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_fd, "fd=%u"},
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
@@ -338,8 +342,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
mutex_init(&sbi->wq_mutex);
spin_lock_init(&sbi->fs_lock);
sbi->queues = NULL;
- spin_lock_init(&sbi->rehash_lock);
- INIT_LIST_HEAD(&sbi->rehash_list);
+ spin_lock_init(&sbi->lookup_lock);
+ INIT_LIST_HEAD(&sbi->active_list);
+ INIT_LIST_HEAD(&sbi->expiring_list);
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
s->s_magic = AUTOFS_SUPER_MAGIC;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index edf5b6bddb5..2a41c2a7fc5 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -25,25 +25,27 @@ static int autofs4_dir_rmdir(struct inode *,struct dentry *);
static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
static int autofs4_dir_open(struct inode *inode, struct file *file);
-static int autofs4_dir_close(struct inode *inode, struct file *file);
-static int autofs4_dir_readdir(struct file * filp, void * dirent, filldir_t filldir);
-static int autofs4_root_readdir(struct file * filp, void * dirent, filldir_t filldir);
static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
static void *autofs4_follow_link(struct dentry *, struct nameidata *);
+#define TRIGGER_FLAGS (LOOKUP_CONTINUE | LOOKUP_DIRECTORY)
+#define TRIGGER_INTENTS (LOOKUP_OPEN | LOOKUP_CREATE)
+
const struct file_operations autofs4_root_operations = {
.open = dcache_dir_open,
.release = dcache_dir_close,
.read = generic_read_dir,
- .readdir = autofs4_root_readdir,
+ .readdir = dcache_readdir,
+ .llseek = dcache_dir_lseek,
.ioctl = autofs4_root_ioctl,
};
const struct file_operations autofs4_dir_operations = {
.open = autofs4_dir_open,
- .release = autofs4_dir_close,
+ .release = dcache_dir_close,
.read = generic_read_dir,
- .readdir = autofs4_dir_readdir,
+ .readdir = dcache_readdir,
+ .llseek = dcache_dir_lseek,
};
const struct inode_operations autofs4_indirect_root_inode_operations = {
@@ -70,42 +72,10 @@ const struct inode_operations autofs4_dir_inode_operations = {
.rmdir = autofs4_dir_rmdir,
};
-static int autofs4_root_readdir(struct file *file, void *dirent,
- filldir_t filldir)
-{
- struct autofs_sb_info *sbi = autofs4_sbi(file->f_path.dentry->d_sb);
- int oz_mode = autofs4_oz_mode(sbi);
-
- DPRINTK("called, filp->f_pos = %lld", file->f_pos);
-
- /*
- * Don't set reghost flag if:
- * 1) f_pos is larger than zero -- we've already been here.
- * 2) we haven't even enabled reghosting in the 1st place.
- * 3) this is the daemon doing a readdir
- */
- if (oz_mode && file->f_pos == 0 && sbi->reghost_enabled)
- sbi->needs_reghost = 1;
-
- DPRINTK("needs_reghost = %d", sbi->needs_reghost);
-
- return dcache_readdir(file, dirent, filldir);
-}
-
static int autofs4_dir_open(struct inode *inode, struct file *file)
{
struct dentry *dentry = file->f_path.dentry;
- struct vfsmount *mnt = file->f_path.mnt;
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct dentry *cursor;
- int status;
-
- status = dcache_dir_open(inode, file);
- if (status)
- goto out;
-
- cursor = file->private_data;
- cursor->d_fsdata = NULL;
DPRINTK("file=%p dentry=%p %.*s",
file, dentry, dentry->d_name.len, dentry->d_name.name);
@@ -113,159 +83,32 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
if (autofs4_oz_mode(sbi))
goto out;
- if (autofs4_ispending(dentry)) {
- DPRINTK("dentry busy");
- dcache_dir_close(inode, file);
- status = -EBUSY;
- goto out;
- }
-
- status = -ENOENT;
- if (!d_mountpoint(dentry) && dentry->d_op && dentry->d_op->d_revalidate) {
- struct nameidata nd;
- int empty, ret;
-
- /* In case there are stale directory dentrys from a failed mount */
- spin_lock(&dcache_lock);
- empty = list_empty(&dentry->d_subdirs);
+ /*
+ * An empty directory in an autofs file system is always a
+ * mount point. The daemon must have failed to mount this
+ * during lookup so it doesn't exist. This can happen, for
+ * example, if user space returns an incorrect status for a
+ * mount request. Otherwise we're doing a readdir on the
+ * autofs file system so just let the libfs routines handle
+ * it.
+ */
+ spin_lock(&dcache_lock);
+ if (!d_mountpoint(dentry) && __simple_empty(dentry)) {
spin_unlock(&dcache_lock);
-
- if (!empty)
- d_invalidate(dentry);
-
- nd.flags = LOOKUP_DIRECTORY;
- ret = (dentry->d_op->d_revalidate)(dentry, &nd);
-
- if (ret <= 0) {
- if (ret < 0)
- status = ret;
- dcache_dir_close(inode, file);
- goto out;
- }
+ return -ENOENT;
}
+ spin_unlock(&dcache_lock);
- if (d_mountpoint(dentry)) {
- struct file *fp = NULL;
- struct path fp_path = { .dentry = dentry, .mnt = mnt };
-
- path_get(&fp_path);
-
- if (!autofs4_follow_mount(&fp_path.mnt, &fp_path.dentry)) {
- path_put(&fp_path);
- dcache_dir_close(inode, file);
- goto out;
- }
-
- fp = dentry_open(fp_path.dentry, fp_path.mnt, file->f_flags);
- status = PTR_ERR(fp);
- if (IS_ERR(fp)) {
- dcache_dir_close(inode, file);
- goto out;
- }
- cursor->d_fsdata = fp;
- }
- return 0;
-out:
- return status;
-}
-
-static int autofs4_dir_close(struct inode *inode, struct file *file)
-{
- struct dentry *dentry = file->f_path.dentry;
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct dentry *cursor = file->private_data;
- int status = 0;
-
- DPRINTK("file=%p dentry=%p %.*s",
- file, dentry, dentry->d_name.len, dentry->d_name.name);
-
- if (autofs4_oz_mode(sbi))
- goto out;
-
- if (autofs4_ispending(dentry)) {
- DPRINTK("dentry busy");
- status = -EBUSY;
- goto out;
- }
-
- if (d_mountpoint(dentry)) {
- struct file *fp = cursor->d_fsdata;
- if (!fp) {
- status = -ENOENT;
- goto out;
- }
- filp_close(fp, current->files);
- }
-out:
- dcache_dir_close(inode, file);
- return status;
-}
-
-static int autofs4_dir_readdir(struct file *file, void *dirent, filldir_t filldir)
-{
- struct dentry *dentry = file->f_path.dentry;
- struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
- struct dentry *cursor = file->private_data;
- int status;
-
- DPRINTK("file=%p dentry=%p %.*s",
- file, dentry, dentry->d_name.len, dentry->d_name.name);
-
- if (autofs4_oz_mode(sbi))
- goto out;
-
- if (autofs4_ispending(dentry)) {
- DPRINTK("dentry busy");
- return -EBUSY;
- }
-
- if (d_mountpoint(dentry)) {
- struct file *fp = cursor->d_fsdata;
-
- if (!fp)
- return -ENOENT;
-
- if (!fp->f_op || !fp->f_op->readdir)
- goto out;
-
- status = vfs_readdir(fp, filldir, dirent);
- file->f_pos = fp->f_pos;
- if (status)
- autofs4_copy_atime(file, fp);
- return status;
- }
out:
- return dcache_readdir(file, dirent, filldir);
+ return dcache_dir_open(inode, file);
}
static int try_to_fill_dentry(struct dentry *dentry, int flags)
{
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
- struct dentry *new;
int status;
- /* Block on any pending expiry here; invalidate the dentry
- when expiration is done to trigger mount request with a new
- dentry */
- if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) {
- DPRINTK("waiting for expire %p name=%.*s",
- dentry, dentry->d_name.len, dentry->d_name.name);
-
- status = autofs4_wait(sbi, dentry, NFY_NONE);
-
- DPRINTK("expire done status=%d", status);
-
- /*
- * If the directory still exists the mount request must
- * continue otherwise it can't be followed at the right
- * time during the walk.
- */
- status = d_invalidate(dentry);
- if (status != -EBUSY)
- return -EAGAIN;
- }
-
DPRINTK("dentry=%p %.*s ino=%p",
dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -292,7 +135,8 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
return status;
}
/* Trigger mount for path component or follow link */
- } else if (flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY) ||
+ } else if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
+ flags & (TRIGGER_FLAGS | TRIGGER_INTENTS) ||
current->link_count) {
DPRINTK("waiting for mount name=%.*s",
dentry->d_name.len, dentry->d_name.name);
@@ -320,26 +164,6 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
spin_unlock(&dentry->d_lock);
- /*
- * The dentry that is passed in from lookup may not be the one
- * we end up using, as mkdir can create a new one. If this
- * happens, and another process tries the lookup at the same time,
- * it will set the PENDING flag on this new dentry, but add itself
- * to our waitq. Then, if after the lookup succeeds, the first
- * process that requested the mount performs another lookup of the
- * same directory, it will show up as still pending! So, we need
- * to redo the lookup here and clear pending on that dentry.
- */
- if (d_unhashed(dentry)) {
- new = d_lookup(dentry->d_parent, &dentry->d_name);
- if (new) {
- spin_lock(&new->d_lock);
- new->d_flags &= ~DCACHE_AUTOFS_PENDING;
- spin_unlock(&new->d_lock);
- dput(new);
- }
- }
-
return 0;
}
@@ -355,51 +179,63 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
DPRINTK("dentry=%p %.*s oz_mode=%d nd->flags=%d",
dentry, dentry->d_name.len, dentry->d_name.name, oz_mode,
nd->flags);
-
- /* If it's our master or we shouldn't trigger a mount we're done */
- lookup_type = nd->flags & (LOOKUP_CONTINUE | LOOKUP_DIRECTORY);
- if (oz_mode || !lookup_type)
+ /*
+ * For an expire of a covered direct or offset mount we need
+ * to beeak out of follow_down() at the autofs mount trigger
+ * (d_mounted--), so we can see the expiring flag, and manage
+ * the blocking and following here until the expire is completed.
+ */
+ if (oz_mode) {
+ spin_lock(&sbi->fs_lock);
+ if (ino->flags & AUTOFS_INF_EXPIRING) {
+ spin_unlock(&sbi->fs_lock);
+ /* Follow down to our covering mount. */
+ if (!follow_down(&nd->path.mnt, &nd->path.dentry))
+ goto done;
+ goto follow;
+ }
+ spin_unlock(&sbi->fs_lock);
goto done;
+ }
- /* If an expire request is pending wait for it. */
- if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) {
- DPRINTK("waiting for active request %p name=%.*s",
- dentry, dentry->d_name.len, dentry->d_name.name);
-
- status = autofs4_wait(sbi, dentry, NFY_NONE);
+ /* If an expire request is pending everyone must wait. */
+ autofs4_expire_wait(dentry);
- DPRINTK("request done status=%d", status);
- }
+ /* We trigger a mount for almost all flags */
+ lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS);
+ if (!(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING))
+ goto follow;
/*
- * If the dentry contains directories then it is an
- * autofs multi-mount with no root mount offset. So
- * don't try to mount it again.
+ * If the dentry contains directories then it is an autofs
+ * multi-mount with no root mount offset. So don't try to
+ * mount it again.
*/
spin_lock(&dcache_lock);
- if (!d_mountpoint(dentry) && __simple_empty(dentry)) {
+ if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
+ (!d_mountpoint(dentry) && __simple_empty(dentry))) {
spin_unlock(&dcache_lock);
status = try_to_fill_dentry(dentry, 0);
if (status)
goto out_error;
- /*
- * The mount succeeded but if there is no root mount
- * it must be an autofs multi-mount with no root offset
- * so we don't need to follow the mount.
- */
- if (d_mountpoint(dentry)) {
- if (!autofs4_follow_mount(&nd->path.mnt,
- &nd->path.dentry)) {
- status = -ENOENT;
- goto out_error;
- }
- }
-
- goto done;
+ goto follow;
}
spin_unlock(&dcache_lock);
+follow:
+ /*
+ * If there is no root mount it must be an autofs
+ * multi-mount with no root offset so we don't need
+ * to follow it.
+ */
+ if (d_mountpoint(dentry)) {
+ if (!autofs4_follow_mount(&nd->path.mnt,
+ &nd->path.dentry)) {
+ status = -ENOENT;
+ goto out_error;
+ }
+ }
done:
return NULL;
@@ -424,12 +260,23 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
int status = 1;
/* Pending dentry */
+ spin_lock(&sbi->fs_lock);
if (autofs4_ispending(dentry)) {
/* The daemon never causes a mount to trigger */
+ spin_unlock(&sbi->fs_lock);
+
if (oz_mode)
return 1;
/*
+ * If the directory has gone away due to an expire
+ * we have been called as ->d_revalidate() and so
+ * we need to return false and proceed to ->lookup().
+ */
+ if (autofs4_expire_wait(dentry) == -EAGAIN)
+ return 0;
+
+ /*
* A zero status is success otherwise we have a
* negative error code.
*/
@@ -437,17 +284,9 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
if (status == 0)
return 1;
- /*
- * A status of EAGAIN here means that the dentry has gone
- * away while waiting for an expire to complete. If we are
- * racing with expire lookup will wait for it so this must
- * be a revalidate and we need to send it to lookup.
- */
- if (status == -EAGAIN)
- return 0;
-
return status;
}
+ spin_unlock(&sbi->fs_lock);
/* Negative dentry.. invalidate if "old" */
if (dentry->d_inode == NULL)
@@ -461,6 +300,7 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
DPRINTK("dentry=%p %.*s, emptydir",
dentry, dentry->d_name.len, dentry->d_name.name);
spin_unlock(&dcache_lock);
+
/* The daemon never causes a mount to trigger */
if (oz_mode)
return 1;
@@ -493,10 +333,12 @@ void autofs4_dentry_release(struct dentry *de)
struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb);
if (sbi) {
- spin_lock(&sbi->rehash_lock);
- if (!list_empty(&inf->rehash))
- list_del(&inf->rehash);
- spin_unlock(&sbi->rehash_lock);
+ spin_lock(&sbi->lookup_lock);
+ if (!list_empty(&inf->active))
+ list_del(&inf->active);
+ if (!list_empty(&inf->expiring))
+ list_del(&inf->expiring);
+ spin_unlock(&sbi->lookup_lock);
}
inf->dentry = NULL;
@@ -518,7 +360,7 @@ static struct dentry_operations autofs4_dentry_operations = {
.d_release = autofs4_dentry_release,
};
-static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
+static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
{
unsigned int len = name->len;
unsigned int hash = name->hash;
@@ -526,14 +368,66 @@ static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct
struct list_head *p, *head;
spin_lock(&dcache_lock);
- spin_lock(&sbi->rehash_lock);
- head = &sbi->rehash_list;
+ spin_lock(&sbi->lookup_lock);
+ head = &sbi->active_list;
list_for_each(p, head) {
struct autofs_info *ino;
struct dentry *dentry;
struct qstr *qstr;
- ino = list_entry(p, struct autofs_info, rehash);
+ ino = list_entry(p, struct autofs_info, active);
+ dentry = ino->dentry;
+
+ spin_lock(&dentry->d_lock);
+
+ /* Already gone? */
+ if (atomic_read(&dentry->d_count) == 0)
+ goto next;
+
+ qstr = &dentry->d_name;
+
+ if (dentry->d_name.hash != hash)
+ goto next;
+ if (dentry->d_parent != parent)
+ goto next;
+
+ if (qstr->len != len)
+ goto next;
+ if (memcmp(qstr->name, str, len))
+ goto next;
+
+ if (d_unhashed(dentry)) {
+ dget(dentry);
+ spin_unlock(&dentry->d_lock);
+ spin_unlock(&sbi->lookup_lock);
+ spin_unlock(&dcache_lock);
+ return dentry;
+ }
+next:
+ spin_unlock(&dentry->d_lock);
+ }
+ spin_unlock(&sbi->lookup_lock);
+ spin_unlock(&dcache_lock);
+
+ return NULL;
+}
+
+static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
+{
+ unsigned int len = name->len;
+ unsigned int hash = name->hash;
+ const unsigned char *str = name->name;
+ struct list_head *p, *head;
+
+ spin_lock(&dcache_lock);
+ spin_lock(&sbi->lookup_lock);
+ head = &sbi->expiring_list;
+ list_for_each(p, head) {
+ struct autofs_info *ino;
+ struct dentry *dentry;
+ struct qstr *qstr;
+
+ ino = list_entry(p, struct autofs_info, expiring);
dentry = ino->dentry;
spin_lock(&dentry->d_lock);
@@ -555,33 +449,16 @@ static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct
goto next;
if (d_unhashed(dentry)) {
- struct inode *inode = dentry->d_inode;
-
- ino = autofs4_dentry_ino(dentry);
- list_del_init(&ino->rehash);
dget(dentry);
- /*
- * Make the rehashed dentry negative so the VFS
- * behaves as it should.
- */
- if (inode) {
- dentry->d_inode = NULL;
- list_del_init(&dentry->d_alias);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&sbi->rehash_lock);
- spin_unlock(&dcache_lock);
- iput(inode);
- return dentry;
- }
spin_unlock(&dentry->d_lock);
- spin_unlock(&sbi->rehash_lock);
+ spin_unlock(&sbi->lookup_lock);
spin_unlock(&dcache_lock);
return dentry;
}
next:
spin_unlock(&dentry->d_lock);
}
- spin_unlock(&sbi->rehash_lock);
+ spin_unlock(&sbi->lookup_lock);
spin_unlock(&dcache_lock);
return NULL;
@@ -591,7 +468,8 @@ next:
static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
struct autofs_sb_info *sbi;
- struct dentry *unhashed;
+ struct autofs_info *ino;
+ struct dentry *expiring, *unhashed;
int oz_mode;
DPRINTK("name = %.*s",
@@ -607,8 +485,26 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
- unhashed = autofs4_lookup_unhashed(sbi, dentry->d_parent, &dentry->d_name);
- if (!unhashed) {
+ expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name);
+ if (expiring) {
+ /*
+ * If we are racing with expire the request might not
+ * be quite complete but the directory has been removed
+ * so it must have been successful, so just wait for it.
+ */
+ ino = autofs4_dentry_ino(expiring);
+ autofs4_expire_wait(expiring);
+ spin_lock(&sbi->lookup_lock);
+ if (!list_empty(&ino->expiring))
+ list_del_init(&ino->expiring);
+ spin_unlock(&sbi->lookup_lock);
+ dput(expiring);
+ }
+
+ unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
+ if (unhashed)
+ dentry = unhashed;
+ else {
/*
* Mark the dentry incomplete but don't hash it. We do this
* to serialize our inode creation operations (symlink and
@@ -622,39 +518,34 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
*/
dentry->d_op = &autofs4_root_dentry_operations;
- dentry->d_fsdata = NULL;
- d_instantiate(dentry, NULL);
- } else {
- struct autofs_info *ino = autofs4_dentry_ino(unhashed);
- DPRINTK("rehash %p with %p", dentry, unhashed);
/*
- * If we are racing with expire the request might not
- * be quite complete but the directory has been removed
- * so it must have been successful, so just wait for it.
- * We need to ensure the AUTOFS_INF_EXPIRING flag is clear
- * before continuing as revalidate may fail when calling
- * try_to_fill_dentry (returning EAGAIN) if we don't.
+ * And we need to ensure that the same dentry is used for
+ * all following lookup calls until it is hashed so that
+ * the dentry flags are persistent throughout the request.
*/
- while (ino && (ino->flags & AUTOFS_INF_EXPIRING)) {
- DPRINTK("wait for incomplete expire %p name=%.*s",
- unhashed, unhashed->d_name.len,
- unhashed->d_name.name);
- autofs4_wait(sbi, unhashed, NFY_NONE);
- DPRINTK("request completed");
- }
- dentry = unhashed;
+ ino = autofs4_init_ino(NULL, sbi, 0555);
+ if (!ino)
+ return ERR_PTR(-ENOMEM);
+
+ dentry->d_fsdata = ino;
+ ino->dentry = dentry;
+
+ spin_lock(&sbi->lookup_lock);
+ list_add(&ino->active, &sbi->active_list);
+ spin_unlock(&sbi->lookup_lock);
+
+ d_instantiate(dentry, NULL);
}
if (!oz_mode) {
spin_lock(&dentry->d_lock);
dentry->d_flags |= DCACHE_AUTOFS_PENDING;
spin_unlock(&dentry->d_lock);
- }
-
- if (dentry->d_op && dentry->d_op->d_revalidate) {
- mutex_unlock(&dir->i_mutex);
- (dentry->d_op->d_revalidate)(dentry, nd);
- mutex_lock(&dir->i_mutex);
+ if (dentry->d_op && dentry->d_op->d_revalidate) {
+ mutex_unlock(&dir->i_mutex);
+ (dentry->d_op->d_revalidate)(dentry, nd);
+ mutex_lock(&dir->i_mutex);
+ }
}
/*
@@ -673,9 +564,11 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
return ERR_PTR(-ERESTARTNOINTR);
}
}
- spin_lock(&dentry->d_lock);
- dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
- spin_unlock(&dentry->d_lock);
+ if (!oz_mode) {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
+ spin_unlock(&dentry->d_lock);
+ }
}
/*
@@ -706,7 +599,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
}
if (unhashed)
- return dentry;
+ return unhashed;
return NULL;
}
@@ -728,20 +621,31 @@ static int autofs4_dir_symlink(struct inode *dir,
return -EACCES;
ino = autofs4_init_ino(ino, sbi, S_IFLNK | 0555);
- if (ino == NULL)
- return -ENOSPC;
+ if (!ino)
+ return -ENOMEM;
- ino->size = strlen(symname);
- ino->u.symlink = cp = kmalloc(ino->size + 1, GFP_KERNEL);
+ spin_lock(&sbi->lookup_lock);
+ if (!list_empty(&ino->active))
+ list_del_init(&ino->active);
+ spin_unlock(&sbi->lookup_lock);
- if (cp == NULL) {
- kfree(ino);
- return -ENOSPC;
+ ino->size = strlen(symname);
+ cp = kmalloc(ino->size + 1, GFP_KERNEL);
+ if (!cp) {
+ if (!dentry->d_fsdata)
+ kfree(ino);
+ return -ENOMEM;
}
strcpy(cp, symname);
inode = autofs4_get_inode(dir->i_sb, ino);
+ if (!inode) {
+ kfree(cp);
+ if (!dentry->d_fsdata)
+ kfree(ino);
+ return -ENOMEM;
+ }
d_add(dentry, inode);
if (dir == dir->i_sb->s_root->d_inode)
@@ -757,6 +661,7 @@ static int autofs4_dir_symlink(struct inode *dir,
atomic_inc(&p_ino->count);
ino->inode = inode;
+ ino->u.symlink = cp;
dir->i_mtime = CURRENT_TIME;
return 0;
@@ -769,9 +674,8 @@ static int autofs4_dir_symlink(struct inode *dir,
* that the file no longer exists. However, doing that means that the
* VFS layer can turn the dentry into a negative dentry. We don't want
* this, because the unlink is probably the result of an expire.
- * We simply d_drop it and add it to a rehash candidates list in the
- * super block, which allows the dentry lookup to reuse it retaining
- * the flags, such as expire in progress, in case we're racing with expire.
+ * We simply d_drop it and add it to a expiring list in the super block,
+ * which allows the dentry lookup to check for an incomplete expire.
*
* If a process is blocked on the dentry waiting for the expire to finish,
* it will invalidate the dentry and try to mount with a new one.
@@ -801,9 +705,10 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
dir->i_mtime = CURRENT_TIME;
spin_lock(&dcache_lock);
- spin_lock(&sbi->rehash_lock);
- list_add(&ino->rehash, &sbi->rehash_list);
- spin_unlock(&sbi->rehash_lock);
+ spin_lock(&sbi->lookup_lock);
+ if (list_empty(&ino->expiring))
+ list_add(&ino->expiring, &sbi->expiring_list);
+ spin_unlock(&sbi->lookup_lock);
spin_lock(&dentry->d_lock);
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
@@ -829,9 +734,10 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
spin_unlock(&dcache_lock);
return -ENOTEMPTY;
}
- spin_lock(&sbi->rehash_lock);
- list_add(&ino->rehash, &sbi->rehash_list);
- spin_unlock(&sbi->rehash_lock);
+ spin_lock(&sbi->lookup_lock);
+ if (list_empty(&ino->expiring))
+ list_add(&ino->expiring, &sbi->expiring_list);
+ spin_unlock(&sbi->lookup_lock);
spin_lock(&dentry->d_lock);
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
@@ -866,10 +772,20 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
dentry, dentry->d_name.len, dentry->d_name.name);
ino = autofs4_init_ino(ino, sbi, S_IFDIR | 0555);
- if (ino == NULL)
- return -ENOSPC;
+ if (!ino)
+ return -ENOMEM;
+
+ spin_lock(&sbi->lookup_lock);
+ if (!list_empty(&ino->active))
+ list_del_init(&ino->active);
+ spin_unlock(&sbi->lookup_lock);
inode = autofs4_get_inode(dir->i_sb, ino);
+ if (!inode) {
+ if (!dentry->d_fsdata)
+ kfree(ino);
+ return -ENOMEM;
+ }
d_add(dentry, inode);
if (dir == dir->i_sb->s_root->d_inode)
@@ -922,44 +838,6 @@ static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user
}
/*
- * Tells the daemon whether we need to reghost or not. Also, clears
- * the reghost_needed flag.
- */
-static inline int autofs4_ask_reghost(struct autofs_sb_info *sbi, int __user *p)
-{
- int status;
-
- DPRINTK("returning %d", sbi->needs_reghost);
-
- status = put_user(sbi->needs_reghost, p);
- if (status)
- return status;
-
- sbi->needs_reghost = 0;
- return 0;
-}
-
-/*
- * Enable / Disable reghosting ioctl() operation
- */
-static inline int autofs4_toggle_reghost(struct autofs_sb_info *sbi, int __user *p)
-{
- int status;
- int val;
-
- status = get_user(val, p);
-
- DPRINTK("reghost = %d", val);
-
- if (status)
- return status;
-
- /* turn on/off reghosting, with the val */
- sbi->reghost_enabled = val;
- return 0;
-}
-
-/*
* Tells the daemon whether it can umount the autofs mount.
*/
static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
@@ -1023,11 +901,6 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
case AUTOFS_IOC_SETTIMEOUT:
return autofs4_get_set_timeout(sbi, p);
- case AUTOFS_IOC_TOGGLEREGHOST:
- return autofs4_toggle_reghost(sbi, p);
- case AUTOFS_IOC_ASKREGHOST:
- return autofs4_ask_reghost(sbi, p);
-
case AUTOFS_IOC_ASKUMOUNT:
return autofs4_ask_umount(filp->f_path.mnt, p);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 75e5955c3f6..35216d18d8b 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -28,6 +28,12 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
{
struct autofs_wait_queue *wq, *nwq;
+ mutex_lock(&sbi->wq_mutex);
+ if (sbi->catatonic) {
+ mutex_unlock(&sbi->wq_mutex);
+ return;
+ }
+
DPRINTK("entering catatonic mode");
sbi->catatonic = 1;
@@ -36,13 +42,18 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
while (wq) {
nwq = wq->next;
wq->status = -ENOENT; /* Magic is gone - report failure */
- kfree(wq->name);
- wq->name = NULL;
+ if (wq->name.name) {
+ kfree(wq->name.name);
+ wq->name.name = NULL;
+ }
+ wq->wait_ctr--;
wake_up_interruptible(&wq->queue);
wq = nwq;
}
fput(sbi->pipe); /* Close the pipe */
sbi->pipe = NULL;
+ sbi->pipefd = -1;
+ mutex_unlock(&sbi->wq_mutex);
}
static int autofs4_write(struct file *file, const void *addr, int bytes)
@@ -89,10 +100,11 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
union autofs_packet_union v4_pkt;
union autofs_v5_packet_union v5_pkt;
} pkt;
+ struct file *pipe = NULL;
size_t pktsz;
DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
- wq->wait_queue_token, wq->len, wq->name, type);
+ wq->wait_queue_token, wq->name.len, wq->name.name, type);
memset(&pkt,0,sizeof pkt); /* For security reasons */
@@ -107,9 +119,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
pktsz = sizeof(*mp);
mp->wait_queue_token = wq->wait_queue_token;
- mp->len = wq->len;
- memcpy(mp->name, wq->name, wq->len);
- mp->name[wq->len] = '\0';
+ mp->len = wq->name.len;
+ memcpy(mp->name, wq->name.name, wq->name.len);
+ mp->name[wq->name.len] = '\0';
break;
}
case autofs_ptype_expire_multi:
@@ -119,9 +131,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
pktsz = sizeof(*ep);
ep->wait_queue_token = wq->wait_queue_token;
- ep->len = wq->len;
- memcpy(ep->name, wq->name, wq->len);
- ep->name[wq->len] = '\0';
+ ep->len = wq->name.len;
+ memcpy(ep->name, wq->name.name, wq->name.len);
+ ep->name[wq->name.len] = '\0';
break;
}
/*
@@ -138,9 +150,9 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
pktsz = sizeof(*packet);
packet->wait_queue_token = wq->wait_queue_token;
- packet->len = wq->len;
- memcpy(packet->name, wq->name, wq->len);
- packet->name[wq->len] = '\0';
+ packet->len = wq->name.len;
+ memcpy(packet->name, wq->name.name, wq->name.len);
+ packet->name[wq->name.len] = '\0';
packet->dev = wq->dev;
packet->ino = wq->ino;
packet->uid = wq->uid;
@@ -154,8 +166,19 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
return;
}
- if (autofs4_write(sbi->pipe, &pkt, pktsz))
- autofs4_catatonic_mode(sbi);
+ /* Check if we have become catatonic */
+ mutex_lock(&sbi->wq_mutex);
+ if (!sbi->catatonic) {
+ pipe = sbi->pipe;
+ get_file(pipe);
+ }
+ mutex_unlock(&sbi->wq_mutex);
+
+ if (pipe) {
+ if (autofs4_write(pipe, &pkt, pktsz))
+ autofs4_catatonic_mode(sbi);
+ fput(pipe);
+ }
}
static int autofs4_getpath(struct autofs_sb_info *sbi,
@@ -191,58 +214,55 @@ static int autofs4_getpath(struct autofs_sb_info *sbi,
}
static struct autofs_wait_queue *
-autofs4_find_wait(struct autofs_sb_info *sbi,
- char *name, unsigned int hash, unsigned int len)
+autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
{
struct autofs_wait_queue *wq;
for (wq = sbi->queues; wq; wq = wq->next) {
- if (wq->hash == hash &&
- wq->len == len &&
- wq->name && !memcmp(wq->name, name, len))
+ if (wq->name.hash == qstr->hash &&
+ wq->name.len == qstr->len &&
+ wq->name.name &&
+ !memcmp(wq->name.name, qstr->name, qstr->len))
break;
}
return wq;
}
-int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
- enum autofs_notify notify)
+/*
+ * Check if we have a valid request.
+ * Returns
+ * 1 if the request should continue.
+ * In this case we can return an autofs_wait_queue entry if one is
+ * found or NULL to idicate a new wait needs to be created.
+ * 0 or a negative errno if the request shouldn't continue.
+ */
+static int validate_request(struct autofs_wait_queue **wait,
+ struct autofs_sb_info *sbi,
+ struct qstr *qstr,
+ struct dentry*dentry, enum autofs_notify notify)
{
- struct autofs_info *ino;
struct autofs_wait_queue *wq;
- char *name;
- unsigned int len = 0;
- unsigned int hash = 0;
- int status, type;
-
- /* In catatonic mode, we don't wait for nobody */
- if (sbi->catatonic)
- return -ENOENT;
-
- name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
- if (!name)
- return -ENOMEM;
+ struct autofs_info *ino;
- /* If this is a direct mount request create a dummy name */
- if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT))
- len = sprintf(name, "%p", dentry);
- else {
- len = autofs4_getpath(sbi, dentry, &name);
- if (!len) {
- kfree(name);
- return -ENOENT;
- }
+ /* Wait in progress, continue; */
+ wq = autofs4_find_wait(sbi, qstr);
+ if (wq) {
+ *wait = wq;
+ return 1;
}
- hash = full_name_hash(name, len);
- if (mutex_lock_interruptible(&sbi->wq_mutex)) {
- kfree(name);
- return -EINTR;
- }
+ *wait = NULL;
- wq = autofs4_find_wait(sbi, name, hash, len);
+ /* If we don't yet have any info this is a new request */
ino = autofs4_dentry_ino(dentry);
- if (!wq && ino && notify == NFY_NONE) {
+ if (!ino)
+ return 1;
+
+ /*
+ * If we've been asked to wait on an existing expire (NFY_NONE)
+ * but there is no wait in the queue ...
+ */
+ if (notify == NFY_NONE) {
/*
* Either we've betean the pending expire to post it's
* wait or it finished while we waited on the mutex.
@@ -253,13 +273,14 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
while (ino->flags & AUTOFS_INF_EXPIRING) {
mutex_unlock(&sbi->wq_mutex);
schedule_timeout_interruptible(HZ/10);
- if (mutex_lock_interruptible(&sbi->wq_mutex)) {
- kfree(name);
+ if (mutex_lock_interruptible(&sbi->wq_mutex))
return -EINTR;
+
+ wq = autofs4_find_wait(sbi, qstr);
+ if (wq) {
+ *wait = wq;
+ return 1;
}
- wq = autofs4_find_wait(sbi, name, hash, len);
- if (wq)
- break;
}
/*
@@ -267,18 +288,96 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
* cases where we wait on NFY_NONE neither depend on the
* return status of the wait.
*/
- if (!wq) {
+ return 0;
+ }
+
+ /*
+ * If we've been asked to trigger a mount and the request
+ * completed while we waited on the mutex ...
+ */
+ if (notify == NFY_MOUNT) {
+ /*
+ * If the dentry isn't hashed just go ahead and try the
+ * mount again with a new wait (not much else we can do).
+ */
+ if (!d_unhashed(dentry)) {
+ /*
+ * But if the dentry is hashed, that means that we
+ * got here through the revalidate path. Thus, we
+ * need to check if the dentry has been mounted
+ * while we waited on the wq_mutex. If it has,
+ * simply return success.
+ */
+ if (d_mountpoint(dentry))
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
+ enum autofs_notify notify)
+{
+ struct autofs_wait_queue *wq;
+ struct qstr qstr;
+ char *name;
+ int status, ret, type;
+
+ /* In catatonic mode, we don't wait for nobody */
+ if (sbi->catatonic)
+ return -ENOENT;
+
+ if (!dentry->d_inode) {
+ /*
+ * A wait for a negative dentry is invalid for certain
+ * cases. A direct or offset mount "always" has its mount
+ * point directory created and so the request dentry must
+ * be positive or the map key doesn't exist. The situation
+ * is very similar for indirect mounts except only dentrys
+ * in the root of the autofs file system may be negative.
+ */
+ if (sbi->type & (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET))
+ return -ENOENT;
+ else if (!IS_ROOT(dentry->d_parent))
+ return -ENOENT;
+ }
+
+ name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ /* If this is a direct mount request create a dummy name */
+ if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT))
+ qstr.len = sprintf(name, "%p", dentry);
+ else {
+ qstr.len = autofs4_getpath(sbi, dentry, &name);
+ if (!qstr.len) {
kfree(name);
- mutex_unlock(&sbi->wq_mutex);
- return 0;
+ return -ENOENT;
}
}
+ qstr.name = name;
+ qstr.hash = full_name_hash(name, qstr.len);
+
+ if (mutex_lock_interruptible(&sbi->wq_mutex)) {
+ kfree(qstr.name);
+ return -EINTR;
+ }
+
+ ret = validate_request(&wq, sbi, &qstr, dentry, notify);
+ if (ret <= 0) {
+ if (ret == 0)
+ mutex_unlock(&sbi->wq_mutex);
+ kfree(qstr.name);
+ return ret;
+ }
if (!wq) {
/* Create a new wait queue */
wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
if (!wq) {
- kfree(name);
+ kfree(qstr.name);
mutex_unlock(&sbi->wq_mutex);
return -ENOMEM;
}
@@ -289,9 +388,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
wq->next = sbi->queues;
sbi->queues = wq;
init_waitqueue_head(&wq->queue);
- wq->hash = hash;
- wq->name = name;
- wq->len = len;
+ memcpy(&wq->name, &qstr, sizeof(struct qstr));
wq->dev = autofs4_get_dev(sbi);
wq->ino = autofs4_get_ino(sbi);
wq->uid = current->uid;
@@ -299,7 +396,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
wq->pid = current->pid;
wq->tgid = current->tgid;
wq->status = -EINTR; /* Status return if interrupted */
- atomic_set(&wq->wait_ctr, 2);
+ wq->wait_ctr = 2;
mutex_unlock(&sbi->wq_mutex);
if (sbi->version < 5) {
@@ -319,28 +416,25 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
}
DPRINTK("new wait id = 0x%08lx, name = %.*s, nfy=%d\n",
- (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify);
+ (unsigned long) wq->wait_queue_token, wq->name.len,
+ wq->name.name, notify);
/* autofs4_notify_daemon() may block */
autofs4_notify_daemon(sbi, wq, type);
} else {
- atomic_inc(&wq->wait_ctr);
+ wq->wait_ctr++;
mutex_unlock(&sbi->wq_mutex);
- kfree(name);
+ kfree(qstr.name);
DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
- (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify);
- }
-
- /* wq->name is NULL if and only if the lock is already released */
-
- if (sbi->catatonic) {
- /* We might have slept, so check again for catatonic mode */
- wq->status = -ENOENT;
- kfree(wq->name);
- wq->name = NULL;
+ (unsigned long) wq->wait_queue_token, wq->name.len,
+ wq->name.name, notify);
}
- if (wq->name) {
+ /*
+ * wq->name.name is NULL iff the lock is already released
+ * or the mount has been made catatonic.
+ */
+ if (wq->name.name) {
/* Block all but "shutdown" signals while waiting */
sigset_t oldset;
unsigned long irqflags;
@@ -351,7 +445,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
recalc_sigpending();
spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
- wait_event_interruptible(wq->queue, wq->name == NULL);
+ wait_event_interruptible(wq->queue, wq->name.name == NULL);
spin_lock_irqsave(&current->sighand->siglock, irqflags);
current->blocked = oldset;
@@ -364,8 +458,10 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
status = wq->status;
/* Are we the last process to need status? */
- if (atomic_dec_and_test(&wq->wait_ctr))
+ mutex_lock(&sbi->wq_mutex);
+ if (!--wq->wait_ctr)
kfree(wq);
+ mutex_unlock(&sbi->wq_mutex);
return status;
}
@@ -387,16 +483,13 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok
}
*wql = wq->next; /* Unlink from chain */
- mutex_unlock(&sbi->wq_mutex);
- kfree(wq->name);
- wq->name = NULL; /* Do not wait on this queue */
-
+ kfree(wq->name.name);
+ wq->name.name = NULL; /* Do not wait on this queue */
wq->status = status;
-
- if (atomic_dec_and_test(&wq->wait_ctr)) /* Is anyone still waiting for this guy? */
+ wake_up_interruptible(&wq->queue);
+ if (!--wq->wait_ctr)
kfree(wq);
- else
- wake_up_interruptible(&wq->queue);
+ mutex_unlock(&sbi->wq_mutex);
return 0;
}
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index f1c2ea8342f..5f1538c03b1 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -243,8 +243,7 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer,
return -EIO;
}
-static int bad_inode_permission(struct inode *inode, int mask,
- struct nameidata *nd)
+static int bad_inode_permission(struct inode *inode, int mask)
{
return -EIO;
}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index e8717de3bab..9286b2af893 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -66,6 +66,7 @@ static struct kmem_cache *befs_inode_cachep;
static const struct file_operations befs_dir_operations = {
.read = generic_read_dir,
.readdir = befs_readdir,
+ .llseek = generic_file_llseek,
};
static const struct inode_operations befs_dir_inode_operations = {
@@ -289,7 +290,7 @@ befs_destroy_inode(struct inode *inode)
kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct befs_inode_info *bi = (struct befs_inode_info *) foo;
@@ -649,7 +650,7 @@ enum {
Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
};
-static match_table_t befs_tokens = {
+static const match_table_t befs_tokens = {
{Opt_uid, "uid=%d"},
{Opt_gid, "gid=%d"},
{Opt_charset, "iocharset=%s"},
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index 70f5d3a8eed..7109e451abf 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -16,8 +16,9 @@ struct bfs_sb_info {
unsigned long si_freei;
unsigned long si_lf_eblk;
unsigned long si_lasti;
- unsigned long * si_imap;
- struct buffer_head * si_sbh; /* buffer header w/superblock */
+ unsigned long *si_imap;
+ struct buffer_head *si_sbh; /* buffer header w/superblock */
+ struct mutex bfs_lock;
};
/*
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 034950cb3cb..ed8feb052df 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -32,16 +32,17 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
struct inode *dir = f->f_path.dentry->d_inode;
struct buffer_head *bh;
struct bfs_dirent *de;
+ struct bfs_sb_info *info = BFS_SB(dir->i_sb);
unsigned int offset;
int block;
- lock_kernel();
+ mutex_lock(&info->bfs_lock);
if (f->f_pos & (BFS_DIRENT_SIZE - 1)) {
printf("Bad f_pos=%08lx for %s:%08lx\n",
(unsigned long)f->f_pos,
dir->i_sb->s_id, dir->i_ino);
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
return -EBADF;
}
@@ -61,7 +62,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
le16_to_cpu(de->ino),
DT_UNKNOWN) < 0) {
brelse(bh);
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
return 0;
}
}
@@ -71,7 +72,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
brelse(bh);
}
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
return 0;
}
@@ -95,10 +96,10 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
inode = new_inode(s);
if (!inode)
return -ENOSPC;
- lock_kernel();
+ mutex_lock(&info->bfs_lock);
ino = find_first_zero_bit(info->si_imap, info->si_lasti);
if (ino > info->si_lasti) {
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
iput(inode);
return -ENOSPC;
}
@@ -124,11 +125,11 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
inode->i_ino);
if (err) {
inode_dec_link_count(inode);
+ mutex_unlock(&info->bfs_lock);
iput(inode);
- unlock_kernel();
return err;
}
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
d_instantiate(dentry, inode);
return 0;
}
@@ -139,22 +140,23 @@ static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry,
struct inode *inode = NULL;
struct buffer_head *bh;
struct bfs_dirent *de;
+ struct bfs_sb_info *info = BFS_SB(dir->i_sb);
if (dentry->d_name.len > BFS_NAMELEN)
return ERR_PTR(-ENAMETOOLONG);
- lock_kernel();
+ mutex_lock(&info->bfs_lock);
bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
if (bh) {
unsigned long ino = (unsigned long)le16_to_cpu(de->ino);
brelse(bh);
inode = bfs_iget(dir->i_sb, ino);
if (IS_ERR(inode)) {
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
return ERR_CAST(inode);
}
}
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
d_add(dentry, inode);
return NULL;
}
@@ -163,13 +165,14 @@ static int bfs_link(struct dentry *old, struct inode *dir,
struct dentry *new)
{
struct inode *inode = old->d_inode;
+ struct bfs_sb_info *info = BFS_SB(inode->i_sb);
int err;
- lock_kernel();
+ mutex_lock(&info->bfs_lock);
err = bfs_add_entry(dir, new->d_name.name, new->d_name.len,
inode->i_ino);
if (err) {
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
return err;
}
inc_nlink(inode);
@@ -177,19 +180,19 @@ static int bfs_link(struct dentry *old, struct inode *dir,
mark_inode_dirty(inode);
atomic_inc(&inode->i_count);
d_instantiate(new, inode);
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
return 0;
}
static int bfs_unlink(struct inode *dir, struct dentry *dentry)
{
int error = -ENOENT;
- struct inode *inode;
+ struct inode *inode = dentry->d_inode;
struct buffer_head *bh;
struct bfs_dirent *de;
+ struct bfs_sb_info *info = BFS_SB(inode->i_sb);
- inode = dentry->d_inode;
- lock_kernel();
+ mutex_lock(&info->bfs_lock);
bh = bfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, &de);
if (!bh || (le16_to_cpu(de->ino) != inode->i_ino))
goto out_brelse;
@@ -210,7 +213,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
out_brelse:
brelse(bh);
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
return error;
}
@@ -220,6 +223,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode, *new_inode;
struct buffer_head *old_bh, *new_bh;
struct bfs_dirent *old_de, *new_de;
+ struct bfs_sb_info *info;
int error = -ENOENT;
old_bh = new_bh = NULL;
@@ -227,7 +231,9 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (S_ISDIR(old_inode->i_mode))
return -EINVAL;
- lock_kernel();
+ info = BFS_SB(old_inode->i_sb);
+
+ mutex_lock(&info->bfs_lock);
old_bh = bfs_find_entry(old_dir,
old_dentry->d_name.name,
old_dentry->d_name.len, &old_de);
@@ -264,7 +270,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
error = 0;
end_rename:
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
brelse(old_bh);
brelse(new_bh);
return error;
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index b11e63e8fbc..6a021265f01 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -99,7 +99,7 @@ static int bfs_get_block(struct inode *inode, sector_t block,
return -ENOSPC;
/* The rest has to be protected against itself. */
- lock_kernel();
+ mutex_lock(&info->bfs_lock);
/*
* If the last data block for this file is the last allocated
@@ -151,7 +151,7 @@ static int bfs_get_block(struct inode *inode, sector_t block,
mark_buffer_dirty(sbh);
map_bh(bh_result, sb, phys);
out:
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
return err;
}
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8db623838b5..0ed57b5ee01 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -104,6 +104,7 @@ static int bfs_write_inode(struct inode *inode, int unused)
struct bfs_inode *di;
struct buffer_head *bh;
int block, off;
+ struct bfs_sb_info *info = BFS_SB(inode->i_sb);
dprintf("ino=%08x\n", ino);
@@ -112,13 +113,13 @@ static int bfs_write_inode(struct inode *inode, int unused)
return -EIO;
}
- lock_kernel();
+ mutex_lock(&info->bfs_lock);
block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
bh = sb_bread(inode->i_sb, block);
if (!bh) {
printf("Unable to read inode %s:%08x\n",
inode->i_sb->s_id, ino);
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
return -EIO;
}
@@ -145,7 +146,7 @@ static int bfs_write_inode(struct inode *inode, int unused)
mark_buffer_dirty(bh);
brelse(bh);
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
return 0;
}
@@ -170,7 +171,7 @@ static void bfs_delete_inode(struct inode *inode)
inode->i_size = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
- lock_kernel();
+ mutex_lock(&info->bfs_lock);
mark_inode_dirty(inode);
block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
@@ -178,7 +179,7 @@ static void bfs_delete_inode(struct inode *inode)
if (!bh) {
printf("Unable to read inode %s:%08lx\n",
inode->i_sb->s_id, ino);
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
return;
}
off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
@@ -204,14 +205,16 @@ static void bfs_delete_inode(struct inode *inode)
info->si_lf_eblk = bi->i_sblock - 1;
mark_buffer_dirty(info->si_sbh);
}
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
clear_inode(inode);
}
static void bfs_put_super(struct super_block *s)
{
struct bfs_sb_info *info = BFS_SB(s);
+
brelse(info->si_sbh);
+ mutex_destroy(&info->bfs_lock);
kfree(info->si_imap);
kfree(info);
s->s_fs_info = NULL;
@@ -236,11 +239,13 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
static void bfs_write_super(struct super_block *s)
{
- lock_kernel();
+ struct bfs_sb_info *info = BFS_SB(s);
+
+ mutex_lock(&info->bfs_lock);
if (!(s->s_flags & MS_RDONLY))
- mark_buffer_dirty(BFS_SB(s)->si_sbh);
+ mark_buffer_dirty(info->si_sbh);
s->s_dirt = 0;
- unlock_kernel();
+ mutex_unlock(&info->bfs_lock);
}
static struct kmem_cache *bfs_inode_cachep;
@@ -259,7 +264,7 @@ static void bfs_destroy_inode(struct inode *inode)
kmem_cache_free(bfs_inode_cachep, BFS_I(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct bfs_inode_info *bi = foo;
@@ -380,7 +385,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
struct bfs_inode *di;
int block = (i - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1;
int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK;
- unsigned long sblock, eblock;
+ unsigned long eblock;
if (!off) {
brelse(bh);
@@ -399,7 +404,6 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
set_bit(i, info->si_imap);
info->si_freeb -= BFS_FILEBLOCKS(di);
- sblock = le32_to_cpu(di->i_sblock);
eblock = le32_to_cpu(di->i_eblock);
if (eblock > info->si_lf_eblk)
info->si_lf_eblk = eblock;
@@ -410,6 +414,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
s->s_dirt = 1;
}
dump_imap("read_super", s);
+ mutex_init(&info->bfs_lock);
return 0;
out:
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ba4cddb92f1..204cfd1d767 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -444,12 +444,6 @@ beyond_if:
regs->gp = ex.a_gpvalue;
#endif
start_thread(regs, ex.a_entry, current->mm->start_stack);
- if (unlikely(current->ptrace & PT_PTRACED)) {
- if (current->ptrace & PT_TRACE_EXEC)
- ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
- else
- send_sig(SIGTRAP, current, 0);
- }
return 0;
}
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d48ff5f370f..655ed8d30a8 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -131,6 +131,15 @@ static int padzero(unsigned long elf_bss)
#define STACK_ALLOC(sp, len) ({ sp -= len ; sp; })
#endif
+#ifndef ELF_BASE_PLATFORM
+/*
+ * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture.
+ * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value
+ * will be copied to the user stack in the same manner as AT_PLATFORM.
+ */
+#define ELF_BASE_PLATFORM NULL
+#endif
+
static int
create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
unsigned long load_addr, unsigned long interp_load_addr)
@@ -142,7 +151,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
elf_addr_t __user *envp;
elf_addr_t __user *sp;
elf_addr_t __user *u_platform;
+ elf_addr_t __user *u_base_platform;
const char *k_platform = ELF_PLATFORM;
+ const char *k_base_platform = ELF_BASE_PLATFORM;
int items;
elf_addr_t *elf_info;
int ei_index = 0;
@@ -172,6 +183,19 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
return -EFAULT;
}
+ /*
+ * If this architecture has a "base" platform capability
+ * string, copy it to userspace.
+ */
+ u_base_platform = NULL;
+ if (k_base_platform) {
+ size_t len = strlen(k_base_platform) + 1;
+
+ u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len);
+ if (__copy_to_user(u_base_platform, k_base_platform, len))
+ return -EFAULT;
+ }
+
/* Create the ELF interpreter info */
elf_info = (elf_addr_t *)current->mm->saved_auxv;
/* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */
@@ -204,10 +228,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
NEW_AUX_ENT(AT_GID, tsk->gid);
NEW_AUX_ENT(AT_EGID, tsk->egid);
NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm));
+ NEW_AUX_ENT(AT_EXECFN, bprm->exec);
if (k_platform) {
NEW_AUX_ENT(AT_PLATFORM,
(elf_addr_t)(unsigned long)u_platform);
}
+ if (k_base_platform) {
+ NEW_AUX_ENT(AT_BASE_PLATFORM,
+ (elf_addr_t)(unsigned long)u_base_platform);
+ }
if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) {
NEW_AUX_ENT(AT_EXECFD, bprm->interp_data);
}
@@ -974,12 +1003,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
#endif
start_thread(regs, elf_entry, bprm->p);
- if (unlikely(current->ptrace & PT_PTRACED)) {
- if (current->ptrace & PT_TRACE_EXEC)
- ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
- else
- send_sig(SIGTRAP, current, 0);
- }
retval = 0;
out:
kfree(loc);
@@ -1477,7 +1500,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
const struct user_regset_view *view = task_user_regset_view(dump_task);
struct elf_thread_core_info *t;
struct elf_prpsinfo *psinfo;
- struct task_struct *g, *p;
+ struct core_thread *ct;
unsigned int i;
info->size = 0;
@@ -1516,31 +1539,26 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
/*
* Allocate a structure for each thread.
*/
- rcu_read_lock();
- do_each_thread(g, p)
- if (p->mm == dump_task->mm) {
- t = kzalloc(offsetof(struct elf_thread_core_info,
- notes[info->thread_notes]),
- GFP_ATOMIC);
- if (unlikely(!t)) {
- rcu_read_unlock();
- return 0;
- }
- t->task = p;
- if (p == dump_task || !info->thread) {
- t->next = info->thread;
- info->thread = t;
- } else {
- /*
- * Make sure to keep the original task at
- * the head of the list.
- */
- t->next = info->thread->next;
- info->thread->next = t;
- }
+ for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) {
+ t = kzalloc(offsetof(struct elf_thread_core_info,
+ notes[info->thread_notes]),
+ GFP_KERNEL);
+ if (unlikely(!t))
+ return 0;
+
+ t->task = ct->task;
+ if (ct->task == dump_task || !info->thread) {
+ t->next = info->thread;
+ info->thread = t;
+ } else {
+ /*
+ * Make sure to keep the original task at
+ * the head of the list.
+ */
+ t->next = info->thread->next;
+ info->thread->next = t;
}
- while_each_thread(g, p);
- rcu_read_unlock();
+ }
/*
* Now fill in each thread's information.
@@ -1687,7 +1705,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
{
#define NUM_NOTES 6
struct list_head *t;
- struct task_struct *g, *p;
info->notes = NULL;
info->prstatus = NULL;
@@ -1719,20 +1736,19 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
info->thread_status_size = 0;
if (signr) {
+ struct core_thread *ct;
struct elf_thread_status *ets;
- rcu_read_lock();
- do_each_thread(g, p)
- if (current->mm == p->mm && current != p) {
- ets = kzalloc(sizeof(*ets), GFP_ATOMIC);
- if (!ets) {
- rcu_read_unlock();
- return 0;
- }
- ets->thread = p;
- list_add(&ets->list, &info->thread_list);
- }
- while_each_thread(g, p);
- rcu_read_unlock();
+
+ for (ct = current->mm->core_state->dumper.next;
+ ct; ct = ct->next) {
+ ets = kzalloc(sizeof(*ets), GFP_KERNEL);
+ if (!ets)
+ return 0;
+
+ ets->thread = ct->task;
+ list_add(&ets->list, &info->thread_list);
+ }
+
list_for_each(t, &info->thread_list) {
int sz;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d051a32e627..80c1f952ef7 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -433,13 +433,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
entryaddr = interp_params.entry_addr ?: exec_params.entry_addr;
start_thread(regs, entryaddr, current->mm->start_stack);
- if (unlikely(current->ptrace & PT_PTRACED)) {
- if (current->ptrace & PT_TRACE_EXEC)
- ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
- else
- send_sig(SIGTRAP, current, 0);
- }
-
retval = 0;
error:
@@ -477,6 +470,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
char __user *u_platform, *p;
long hwcap;
int loop;
+ int nr; /* reset for each csp adjustment */
/* we're going to shovel a whole load of stuff onto the stack */
#ifdef CONFIG_MMU
@@ -549,10 +543,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
/* force 16 byte _final_ alignment here for generality */
#define DLINFO_ITEMS 13
- nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0);
-#ifdef DLINFO_ARCH_ITEMS
- nitems += DLINFO_ARCH_ITEMS;
-#endif
+ nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH;
csp = sp;
sp -= nitems * 2 * sizeof(unsigned long);
@@ -564,39 +555,46 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
sp -= sp & 15UL;
/* put the ELF interpreter info on the stack */
-#define NEW_AUX_ENT(nr, id, val) \
+#define NEW_AUX_ENT(id, val) \
do { \
struct { unsigned long _id, _val; } __user *ent; \
\
ent = (void __user *) csp; \
__put_user((id), &ent[nr]._id); \
__put_user((val), &ent[nr]._val); \
+ nr++; \
} while (0)
+ nr = 0;
csp -= 2 * sizeof(unsigned long);
- NEW_AUX_ENT(0, AT_NULL, 0);
+ NEW_AUX_ENT(AT_NULL, 0);
if (k_platform) {
+ nr = 0;
csp -= 2 * sizeof(unsigned long);
- NEW_AUX_ENT(0, AT_PLATFORM,
+ NEW_AUX_ENT(AT_PLATFORM,
(elf_addr_t) (unsigned long) u_platform);
}
+ nr = 0;
csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long);
- NEW_AUX_ENT( 0, AT_HWCAP, hwcap);
- NEW_AUX_ENT( 1, AT_PAGESZ, PAGE_SIZE);
- NEW_AUX_ENT( 2, AT_CLKTCK, CLOCKS_PER_SEC);
- NEW_AUX_ENT( 3, AT_PHDR, exec_params->ph_addr);
- NEW_AUX_ENT( 4, AT_PHENT, sizeof(struct elf_phdr));
- NEW_AUX_ENT( 5, AT_PHNUM, exec_params->hdr.e_phnum);
- NEW_AUX_ENT( 6, AT_BASE, interp_params->elfhdr_addr);
- NEW_AUX_ENT( 7, AT_FLAGS, 0);
- NEW_AUX_ENT( 8, AT_ENTRY, exec_params->entry_addr);
- NEW_AUX_ENT( 9, AT_UID, (elf_addr_t) current->uid);
- NEW_AUX_ENT(10, AT_EUID, (elf_addr_t) current->euid);
- NEW_AUX_ENT(11, AT_GID, (elf_addr_t) current->gid);
- NEW_AUX_ENT(12, AT_EGID, (elf_addr_t) current->egid);
+ NEW_AUX_ENT(AT_HWCAP, hwcap);
+ NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE);
+ NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
+ NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr);
+ NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
+ NEW_AUX_ENT(AT_PHNUM, exec_params->hdr.e_phnum);
+ NEW_AUX_ENT(AT_BASE, interp_params->elfhdr_addr);
+ NEW_AUX_ENT(AT_FLAGS, 0);
+ NEW_AUX_ENT(AT_ENTRY, exec_params->entry_addr);
+ NEW_AUX_ENT(AT_UID, (elf_addr_t) current->uid);
+ NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->euid);
+ NEW_AUX_ENT(AT_GID, (elf_addr_t) current->gid);
+ NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->egid);
#ifdef ARCH_DLINFO
+ nr = 0;
+ csp -= AT_VECTOR_SIZE_ARCH * 2 * sizeof(unsigned long);
+
/* ARCH_DLINFO must come last so platform specific code can enforce
* special alignment requirements on the AUXV if necessary (eg. PPC).
*/
@@ -1573,7 +1571,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
struct memelfnote *notes = NULL;
struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */
struct elf_prpsinfo *psinfo = NULL; /* NT_PRPSINFO */
- struct task_struct *g, *p;
LIST_HEAD(thread_list);
struct list_head *t;
elf_fpregset_t *fpu = NULL;
@@ -1622,20 +1619,19 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
#endif
if (signr) {
+ struct core_thread *ct;
struct elf_thread_status *tmp;
- rcu_read_lock();
- do_each_thread(g,p)
- if (current->mm == p->mm && current != p) {
- tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC);
- if (!tmp) {
- rcu_read_unlock();
- goto cleanup;
- }
- tmp->thread = p;
- list_add(&tmp->list, &thread_list);
- }
- while_each_thread(g,p);
- rcu_read_unlock();
+
+ for (ct = current->mm->core_state->dumper.next;
+ ct; ct = ct->next) {
+ tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
+ if (!tmp)
+ goto cleanup;
+
+ tmp->thread = ct->task;
+ list_add(&tmp->list, &thread_list);
+ }
+
list_for_each(t, &thread_list) {
struct elf_thread_status *tmp;
int sz;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 2cb1acda3a8..dfc0197905c 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -914,15 +914,14 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
/* Stash our initial stack pointer into the mm structure */
current->mm->start_stack = (unsigned long )sp;
-
+#ifdef FLAT_PLAT_INIT
+ FLAT_PLAT_INIT(regs);
+#endif
DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n",
(int)regs, (int)start_addr, (int)current->mm->start_stack);
start_thread(regs, start_addr, current->mm->start_stack);
- if (current->ptrace & PT_PTRACED)
- send_sig(SIGTRAP, current, 0);
-
return 0;
}
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 7191306367c..8d7e88e02e0 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -27,6 +27,7 @@
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/syscalls.h>
+#include <linux/fs.h>
#include <asm/uaccess.h>
@@ -119,8 +120,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
if (bprm->misc_bang)
goto _ret;
- bprm->misc_bang = 1;
-
/* to keep locking time low, we copy the interpreter string */
read_lock(&entries_lock);
fmt = check_file(bprm);
@@ -198,6 +197,8 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
if (retval < 0)
goto _error;
+ bprm->misc_bang = 1;
+
retval = search_binary_handler (bprm, regs);
if (retval < 0)
goto _error;
@@ -535,31 +536,16 @@ static ssize_t
bm_entry_read(struct file * file, char __user * buf, size_t nbytes, loff_t *ppos)
{
Node *e = file->f_path.dentry->d_inode->i_private;
- loff_t pos = *ppos;
ssize_t res;
char *page;
- int len;
if (!(page = (char*) __get_free_page(GFP_KERNEL)))
return -ENOMEM;
entry_status(e, page);
- len = strlen(page);
- res = -EINVAL;
- if (pos < 0)
- goto out;
- res = 0;
- if (pos >= len)
- goto out;
- if (len < pos + nbytes)
- nbytes = len - pos;
- res = -EFAULT;
- if (copy_to_user(buf, page + pos, nbytes))
- goto out;
- *ppos = pos + nbytes;
- res = nbytes;
-out:
+ res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page));
+
free_page((unsigned long) page);
return res;
}
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index fdc36bfd6a7..68be580ba28 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -274,8 +274,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
map_hpux_gateway_page(current,current->mm);
start_thread_som(regs, som_entry, bprm->p);
- if (current->ptrace & PT_PTRACED)
- send_sig(SIGTRAP, current, 0);
return 0;
/* error cleanup */
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
new file mode 100644
index 00000000000..19caf7c962a
--- /dev/null
+++ b/fs/bio-integrity.c
@@ -0,0 +1,745 @@
+/*
+ * bio-integrity.c - bio data integrity extensions
+ *
+ * Copyright (C) 2007, 2008 Oracle Corporation
+ * Written by: Martin K. Petersen <martin.petersen@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+
+static struct kmem_cache *bio_integrity_slab __read_mostly;
+static struct workqueue_struct *kintegrityd_wq;
+
+/**
+ * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * @bio: bio to attach integrity metadata to
+ * @gfp_mask: Memory allocation mask
+ * @nr_vecs: Number of integrity metadata scatter-gather elements
+ * @bs: bio_set to allocate from
+ *
+ * Description: This function prepares a bio for attaching integrity
+ * metadata. nr_vecs specifies the maximum number of pages containing
+ * integrity metadata that can be attached.
+ */
+struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
+ gfp_t gfp_mask,
+ unsigned int nr_vecs,
+ struct bio_set *bs)
+{
+ struct bio_integrity_payload *bip;
+ struct bio_vec *iv;
+ unsigned long idx;
+
+ BUG_ON(bio == NULL);
+
+ bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+ if (unlikely(bip == NULL)) {
+ printk(KERN_ERR "%s: could not alloc bip\n", __func__);
+ return NULL;
+ }
+
+ memset(bip, 0, sizeof(*bip));
+
+ iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs);
+ if (unlikely(iv == NULL)) {
+ printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
+ mempool_free(bip, bs->bio_integrity_pool);
+ return NULL;
+ }
+
+ bip->bip_pool = idx;
+ bip->bip_vec = iv;
+ bip->bip_bio = bio;
+ bio->bi_integrity = bip;
+
+ return bip;
+}
+EXPORT_SYMBOL(bio_integrity_alloc_bioset);
+
+/**
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
+ * @bio: bio to attach integrity metadata to
+ * @gfp_mask: Memory allocation mask
+ * @nr_vecs: Number of integrity metadata scatter-gather elements
+ *
+ * Description: This function prepares a bio for attaching integrity
+ * metadata. nr_vecs specifies the maximum number of pages containing
+ * integrity metadata that can be attached.
+ */
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+ gfp_t gfp_mask,
+ unsigned int nr_vecs)
+{
+ return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
+}
+EXPORT_SYMBOL(bio_integrity_alloc);
+
+/**
+ * bio_integrity_free - Free bio integrity payload
+ * @bio: bio containing bip to be freed
+ * @bs: bio_set this bio was allocated from
+ *
+ * Description: Used to free the integrity portion of a bio. Usually
+ * called from bio_free().
+ */
+void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+
+ BUG_ON(bip == NULL);
+
+ /* A cloned bio doesn't own the integrity metadata */
+ if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
+ && bip->bip_buf != NULL)
+ kfree(bip->bip_buf);
+
+ mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
+ mempool_free(bip, bs->bio_integrity_pool);
+
+ bio->bi_integrity = NULL;
+}
+EXPORT_SYMBOL(bio_integrity_free);
+
+/**
+ * bio_integrity_add_page - Attach integrity metadata
+ * @bio: bio to update
+ * @page: page containing integrity metadata
+ * @len: number of bytes of integrity metadata in page
+ * @offset: start offset within page
+ *
+ * Description: Attach a page containing integrity metadata to bio.
+ */
+int bio_integrity_add_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct bio_vec *iv;
+
+ if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) {
+ printk(KERN_ERR "%s: bip_vec full\n", __func__);
+ return 0;
+ }
+
+ iv = bip_vec_idx(bip, bip->bip_vcnt);
+ BUG_ON(iv == NULL);
+ BUG_ON(iv->bv_page != NULL);
+
+ iv->bv_page = page;
+ iv->bv_len = len;
+ iv->bv_offset = offset;
+ bip->bip_vcnt++;
+
+ return len;
+}
+EXPORT_SYMBOL(bio_integrity_add_page);
+
+static int bdev_integrity_enabled(struct block_device *bdev, int rw)
+{
+ struct blk_integrity *bi = bdev_get_integrity(bdev);
+
+ if (bi == NULL)
+ return 0;
+
+ if (rw == READ && bi->verify_fn != NULL &&
+ (bi->flags & INTEGRITY_FLAG_READ))
+ return 1;
+
+ if (rw == WRITE && bi->generate_fn != NULL &&
+ (bi->flags & INTEGRITY_FLAG_WRITE))
+ return 1;
+
+ return 0;
+}
+
+/**
+ * bio_integrity_enabled - Check whether integrity can be passed
+ * @bio: bio to check
+ *
+ * Description: Determines whether bio_integrity_prep() can be called
+ * on this bio or not. bio data direction and target device must be
+ * set prior to calling. The functions honors the write_generate and
+ * read_verify flags in sysfs.
+ */
+int bio_integrity_enabled(struct bio *bio)
+{
+ /* Already protected? */
+ if (bio_integrity(bio))
+ return 0;
+
+ return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
+}
+EXPORT_SYMBOL(bio_integrity_enabled);
+
+/**
+ * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
+ * @bi: blk_integrity profile for device
+ * @sectors: Number of 512 sectors to convert
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the hardware
+ * sector size of the storage device. Convert the block layer sectors
+ * to physical sectors.
+ */
+static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
+ unsigned int sectors)
+{
+ /* At this point there are only 512b or 4096b DIF/EPP devices */
+ if (bi->sector_size == 4096)
+ return sectors >>= 3;
+
+ return sectors;
+}
+
+/**
+ * bio_integrity_tag_size - Retrieve integrity tag space
+ * @bio: bio to inspect
+ *
+ * Description: Returns the maximum number of tag bytes that can be
+ * attached to this bio. Filesystems can use this to determine how
+ * much metadata to attach to an I/O.
+ */
+unsigned int bio_integrity_tag_size(struct bio *bio)
+{
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+
+ BUG_ON(bio->bi_size == 0);
+
+ return bi->tag_size * (bio->bi_size / bi->sector_size);
+}
+EXPORT_SYMBOL(bio_integrity_tag_size);
+
+int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ unsigned int nr_sectors;
+
+ BUG_ON(bip->bip_buf == NULL);
+
+ if (bi->tag_size == 0)
+ return -1;
+
+ nr_sectors = bio_integrity_hw_sectors(bi,
+ DIV_ROUND_UP(len, bi->tag_size));
+
+ if (nr_sectors * bi->tuple_size > bip->bip_size) {
+ printk(KERN_ERR "%s: tag too big for bio: %u > %u\n",
+ __func__, nr_sectors * bi->tuple_size, bip->bip_size);
+ return -1;
+ }
+
+ if (set)
+ bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
+ else
+ bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
+
+ return 0;
+}
+
+/**
+ * bio_integrity_set_tag - Attach a tag buffer to a bio
+ * @bio: bio to attach buffer to
+ * @tag_buf: Pointer to a buffer containing tag data
+ * @len: Length of the included buffer
+ *
+ * Description: Use this function to tag a bio by leveraging the extra
+ * space provided by devices formatted with integrity protection. The
+ * size of the integrity buffer must be <= to the size reported by
+ * bio_integrity_tag_size().
+ */
+int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+ BUG_ON(bio_data_dir(bio) != WRITE);
+
+ return bio_integrity_tag(bio, tag_buf, len, 1);
+}
+EXPORT_SYMBOL(bio_integrity_set_tag);
+
+/**
+ * bio_integrity_get_tag - Retrieve a tag buffer from a bio
+ * @bio: bio to retrieve buffer from
+ * @tag_buf: Pointer to a buffer for the tag data
+ * @len: Length of the target buffer
+ *
+ * Description: Use this function to retrieve the tag buffer from a
+ * completed I/O. The size of the integrity buffer must be <= to the
+ * size reported by bio_integrity_tag_size().
+ */
+int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+ BUG_ON(bio_data_dir(bio) != READ);
+
+ return bio_integrity_tag(bio, tag_buf, len, 0);
+}
+EXPORT_SYMBOL(bio_integrity_get_tag);
+
+/**
+ * bio_integrity_generate - Generate integrity metadata for a bio
+ * @bio: bio to generate integrity metadata for
+ *
+ * Description: Generates integrity metadata for a bio by calling the
+ * block device's generation callback function. The bio must have a
+ * bip attached with enough room to accommodate the generated
+ * integrity metadata.
+ */
+static void bio_integrity_generate(struct bio *bio)
+{
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct blk_integrity_exchg bix;
+ struct bio_vec *bv;
+ sector_t sector = bio->bi_sector;
+ unsigned int i, sectors, total;
+ void *prot_buf = bio->bi_integrity->bip_buf;
+
+ total = 0;
+ bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+ bix.sector_size = bi->sector_size;
+
+ bio_for_each_segment(bv, bio, i) {
+ void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+ bix.data_buf = kaddr + bv->bv_offset;
+ bix.data_size = bv->bv_len;
+ bix.prot_buf = prot_buf;
+ bix.sector = sector;
+
+ bi->generate_fn(&bix);
+
+ sectors = bv->bv_len / bi->sector_size;
+ sector += sectors;
+ prot_buf += sectors * bi->tuple_size;
+ total += sectors * bi->tuple_size;
+ BUG_ON(total > bio->bi_integrity->bip_size);
+
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+}
+
+static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
+{
+ if (bi)
+ return bi->tuple_size;
+
+ return 0;
+}
+
+/**
+ * bio_integrity_prep - Prepare bio for integrity I/O
+ * @bio: bio to prepare
+ *
+ * Description: Allocates a buffer for integrity metadata, maps the
+ * pages and attaches them to a bio. The bio must have data
+ * direction, target device and start sector set priot to calling. In
+ * the WRITE case, integrity metadata will be generated using the
+ * block device's integrity function. In the READ case, the buffer
+ * will be prepared for DMA and a suitable end_io handler set up.
+ */
+int bio_integrity_prep(struct bio *bio)
+{
+ struct bio_integrity_payload *bip;
+ struct blk_integrity *bi;
+ struct request_queue *q;
+ void *buf;
+ unsigned long start, end;
+ unsigned int len, nr_pages;
+ unsigned int bytes, offset, i;
+ unsigned int sectors;
+
+ bi = bdev_get_integrity(bio->bi_bdev);
+ q = bdev_get_queue(bio->bi_bdev);
+ BUG_ON(bi == NULL);
+ BUG_ON(bio_integrity(bio));
+
+ sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
+
+ /* Allocate kernel buffer for protection data */
+ len = sectors * blk_integrity_tuple_size(bi);
+ buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp);
+ if (unlikely(buf == NULL)) {
+ printk(KERN_ERR "could not allocate integrity buffer\n");
+ return -EIO;
+ }
+
+ end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = ((unsigned long) buf) >> PAGE_SHIFT;
+ nr_pages = end - start;
+
+ /* Allocate bio integrity payload and integrity vectors */
+ bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
+ if (unlikely(bip == NULL)) {
+ printk(KERN_ERR "could not allocate data integrity bioset\n");
+ kfree(buf);
+ return -EIO;
+ }
+
+ bip->bip_buf = buf;
+ bip->bip_size = len;
+ bip->bip_sector = bio->bi_sector;
+
+ /* Map it */
+ offset = offset_in_page(buf);
+ for (i = 0 ; i < nr_pages ; i++) {
+ int ret;
+ bytes = PAGE_SIZE - offset;
+
+ if (len <= 0)
+ break;
+
+ if (bytes > len)
+ bytes = len;
+
+ ret = bio_integrity_add_page(bio, virt_to_page(buf),
+ bytes, offset);
+
+ if (ret == 0)
+ return 0;
+
+ if (ret < bytes)
+ break;
+
+ buf += bytes;
+ len -= bytes;
+ offset = 0;
+ }
+
+ /* Install custom I/O completion handler if read verify is enabled */
+ if (bio_data_dir(bio) == READ) {
+ bip->bip_end_io = bio->bi_end_io;
+ bio->bi_end_io = bio_integrity_endio;
+ }
+
+ /* Auto-generate integrity metadata if this is a write */
+ if (bio_data_dir(bio) == WRITE)
+ bio_integrity_generate(bio);
+
+ return 0;
+}
+EXPORT_SYMBOL(bio_integrity_prep);
+
+/**
+ * bio_integrity_verify - Verify integrity metadata for a bio
+ * @bio: bio to verify
+ *
+ * Description: This function is called to verify the integrity of a
+ * bio. The data in the bio io_vec is compared to the integrity
+ * metadata returned by the HBA.
+ */
+static int bio_integrity_verify(struct bio *bio)
+{
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct blk_integrity_exchg bix;
+ struct bio_vec *bv;
+ sector_t sector = bio->bi_integrity->bip_sector;
+ unsigned int i, sectors, total, ret;
+ void *prot_buf = bio->bi_integrity->bip_buf;
+
+ ret = total = 0;
+ bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+ bix.sector_size = bi->sector_size;
+
+ bio_for_each_segment(bv, bio, i) {
+ void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+ bix.data_buf = kaddr + bv->bv_offset;
+ bix.data_size = bv->bv_len;
+ bix.prot_buf = prot_buf;
+ bix.sector = sector;
+
+ ret = bi->verify_fn(&bix);
+
+ if (ret) {
+ kunmap_atomic(kaddr, KM_USER0);
+ break;
+ }
+
+ sectors = bv->bv_len / bi->sector_size;
+ sector += sectors;
+ prot_buf += sectors * bi->tuple_size;
+ total += sectors * bi->tuple_size;
+ BUG_ON(total > bio->bi_integrity->bip_size);
+
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+
+ return ret;
+}
+
+/**
+ * bio_integrity_verify_fn - Integrity I/O completion worker
+ * @work: Work struct stored in bio to be verified
+ *
+ * Description: This workqueue function is called to complete a READ
+ * request. The function verifies the transferred integrity metadata
+ * and then calls the original bio end_io function.
+ */
+static void bio_integrity_verify_fn(struct work_struct *work)
+{
+ struct bio_integrity_payload *bip =
+ container_of(work, struct bio_integrity_payload, bip_work);
+ struct bio *bio = bip->bip_bio;
+ int error = bip->bip_error;
+
+ if (bio_integrity_verify(bio)) {
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ error = -EIO;
+ }
+
+ /* Restore original bio completion handler */
+ bio->bi_end_io = bip->bip_end_io;
+
+ if (bio->bi_end_io)
+ bio->bi_end_io(bio, error);
+}
+
+/**
+ * bio_integrity_endio - Integrity I/O completion function
+ * @bio: Protected bio
+ * @error: Pointer to errno
+ *
+ * Description: Completion for integrity I/O
+ *
+ * Normally I/O completion is done in interrupt context. However,
+ * verifying I/O integrity is a time-consuming task which must be run
+ * in process context. This function postpones completion
+ * accordingly.
+ */
+void bio_integrity_endio(struct bio *bio, int error)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+
+ BUG_ON(bip->bip_bio != bio);
+
+ bip->bip_error = error;
+ INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
+ queue_work(kintegrityd_wq, &bip->bip_work);
+}
+EXPORT_SYMBOL(bio_integrity_endio);
+
+/**
+ * bio_integrity_mark_head - Advance bip_vec skip bytes
+ * @bip: Integrity vector to advance
+ * @skip: Number of bytes to advance it
+ */
+void bio_integrity_mark_head(struct bio_integrity_payload *bip,
+ unsigned int skip)
+{
+ struct bio_vec *iv;
+ unsigned int i;
+
+ bip_for_each_vec(iv, bip, i) {
+ if (skip == 0) {
+ bip->bip_idx = i;
+ return;
+ } else if (skip >= iv->bv_len) {
+ skip -= iv->bv_len;
+ } else { /* skip < iv->bv_len) */
+ iv->bv_offset += skip;
+ iv->bv_len -= skip;
+ bip->bip_idx = i;
+ return;
+ }
+ }
+}
+
+/**
+ * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long
+ * @bip: Integrity vector to truncate
+ * @len: New length of integrity vector
+ */
+void bio_integrity_mark_tail(struct bio_integrity_payload *bip,
+ unsigned int len)
+{
+ struct bio_vec *iv;
+ unsigned int i;
+
+ bip_for_each_vec(iv, bip, i) {
+ if (len == 0) {
+ bip->bip_vcnt = i;
+ return;
+ } else if (len >= iv->bv_len) {
+ len -= iv->bv_len;
+ } else { /* len < iv->bv_len) */
+ iv->bv_len = len;
+ len = 0;
+ }
+ }
+}
+
+/**
+ * bio_integrity_advance - Advance integrity vector
+ * @bio: bio whose integrity vector to update
+ * @bytes_done: number of data bytes that have been completed
+ *
+ * Description: This function calculates how many integrity bytes the
+ * number of completed data bytes correspond to and advances the
+ * integrity vector accordingly.
+ */
+void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ unsigned int nr_sectors;
+
+ BUG_ON(bip == NULL);
+ BUG_ON(bi == NULL);
+
+ nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9);
+ bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size);
+}
+EXPORT_SYMBOL(bio_integrity_advance);
+
+/**
+ * bio_integrity_trim - Trim integrity vector
+ * @bio: bio whose integrity vector to update
+ * @offset: offset to first data sector
+ * @sectors: number of data sectors
+ *
+ * Description: Used to trim the integrity vector in a cloned bio.
+ * The ivec will be advanced corresponding to 'offset' data sectors
+ * and the length will be truncated corresponding to 'len' data
+ * sectors.
+ */
+void bio_integrity_trim(struct bio *bio, unsigned int offset,
+ unsigned int sectors)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ unsigned int nr_sectors;
+
+ BUG_ON(bip == NULL);
+ BUG_ON(bi == NULL);
+ BUG_ON(!bio_flagged(bio, BIO_CLONED));
+
+ nr_sectors = bio_integrity_hw_sectors(bi, sectors);
+ bip->bip_sector = bip->bip_sector + offset;
+ bio_integrity_mark_head(bip, offset * bi->tuple_size);
+ bio_integrity_mark_tail(bip, sectors * bi->tuple_size);
+}
+EXPORT_SYMBOL(bio_integrity_trim);
+
+/**
+ * bio_integrity_split - Split integrity metadata
+ * @bio: Protected bio
+ * @bp: Resulting bio_pair
+ * @sectors: Offset
+ *
+ * Description: Splits an integrity page into a bio_pair.
+ */
+void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
+{
+ struct blk_integrity *bi;
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ unsigned int nr_sectors;
+
+ if (bio_integrity(bio) == 0)
+ return;
+
+ bi = bdev_get_integrity(bio->bi_bdev);
+ BUG_ON(bi == NULL);
+ BUG_ON(bip->bip_vcnt != 1);
+
+ nr_sectors = bio_integrity_hw_sectors(bi, sectors);
+
+ bp->bio1.bi_integrity = &bp->bip1;
+ bp->bio2.bi_integrity = &bp->bip2;
+
+ bp->iv1 = bip->bip_vec[0];
+ bp->iv2 = bip->bip_vec[0];
+
+ bp->bip1.bip_vec = &bp->iv1;
+ bp->bip2.bip_vec = &bp->iv2;
+
+ bp->iv1.bv_len = sectors * bi->tuple_size;
+ bp->iv2.bv_offset += sectors * bi->tuple_size;
+ bp->iv2.bv_len -= sectors * bi->tuple_size;
+
+ bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
+ bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
+
+ bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
+ bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
+}
+EXPORT_SYMBOL(bio_integrity_split);
+
+/**
+ * bio_integrity_clone - Callback for cloning bios with integrity metadata
+ * @bio: New bio
+ * @bio_src: Original bio
+ * @bs: bio_set to allocate bip from
+ *
+ * Description: Called to allocate a bip when cloning a bio
+ */
+int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
+ struct bio_set *bs)
+{
+ struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
+ struct bio_integrity_payload *bip;
+
+ BUG_ON(bip_src == NULL);
+
+ bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs);
+
+ if (bip == NULL)
+ return -EIO;
+
+ memcpy(bip->bip_vec, bip_src->bip_vec,
+ bip_src->bip_vcnt * sizeof(struct bio_vec));
+
+ bip->bip_sector = bip_src->bip_sector;
+ bip->bip_vcnt = bip_src->bip_vcnt;
+ bip->bip_idx = bip_src->bip_idx;
+
+ return 0;
+}
+EXPORT_SYMBOL(bio_integrity_clone);
+
+int bioset_integrity_create(struct bio_set *bs, int pool_size)
+{
+ bs->bio_integrity_pool = mempool_create_slab_pool(pool_size,
+ bio_integrity_slab);
+ if (!bs->bio_integrity_pool)
+ return -1;
+
+ return 0;
+}
+EXPORT_SYMBOL(bioset_integrity_create);
+
+void bioset_integrity_free(struct bio_set *bs)
+{
+ if (bs->bio_integrity_pool)
+ mempool_destroy(bs->bio_integrity_pool);
+}
+EXPORT_SYMBOL(bioset_integrity_free);
+
+void __init bio_integrity_init_slab(void)
+{
+ bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+}
+
+static int __init integrity_init(void)
+{
+ kintegrityd_wq = create_workqueue("kintegrityd");
+
+ if (!kintegrityd_wq)
+ panic("Failed to create kintegrityd\n");
+
+ return 0;
+}
+subsys_initcall(integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 78562574cb5..77a55bcceed 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,24 +28,9 @@
#include <linux/blktrace_api.h>
#include <scsi/sg.h> /* for struct sg_iovec */
-#define BIO_POOL_SIZE 2
-
static struct kmem_cache *bio_slab __read_mostly;
-#define BIOVEC_NR_POOLS 6
-
-/*
- * a small number of entries is fine, not going to be performance critical.
- * basically we just need to survive
- */
-#define BIO_SPLIT_ENTRIES 2
-mempool_t *bio_split_pool __read_mostly;
-
-struct biovec_slab {
- int nr_vecs;
- char *name;
- struct kmem_cache *slab;
-};
+static mempool_t *bio_split_pool __read_mostly;
/*
* if you change this list, also change bvec_alloc or things will
@@ -60,49 +45,61 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
#undef BV
/*
- * bio_set is used to allow other portions of the IO system to
- * allocate their own private memory pools for bio and iovec structures.
- * These memory pools in turn all allocate from the bio_slab
- * and the bvec_slabs[].
- */
-struct bio_set {
- mempool_t *bio_pool;
- mempool_t *bvec_pools[BIOVEC_NR_POOLS];
-};
-
-/*
* fs_bio_set is the bio_set containing bio and iovec memory pools used by
* IO code that does not need private memory pools.
*/
-static struct bio_set *fs_bio_set;
+struct bio_set *fs_bio_set;
+
+unsigned int bvec_nr_vecs(unsigned short idx)
+{
+ return bvec_slabs[idx].nr_vecs;
+}
-static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
+struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
{
struct bio_vec *bvl;
/*
- * see comment near bvec_array define!
+ * If 'bs' is given, lookup the pool and do the mempool alloc.
+ * If not, this is a bio_kmalloc() allocation and just do a
+ * kzalloc() for the exact number of vecs right away.
*/
- switch (nr) {
- case 1 : *idx = 0; break;
- case 2 ... 4: *idx = 1; break;
- case 5 ... 16: *idx = 2; break;
- case 17 ... 64: *idx = 3; break;
- case 65 ... 128: *idx = 4; break;
- case 129 ... BIO_MAX_PAGES: *idx = 5; break;
+ if (bs) {
+ /*
+ * see comment near bvec_array define!
+ */
+ switch (nr) {
+ case 1:
+ *idx = 0;
+ break;
+ case 2 ... 4:
+ *idx = 1;
+ break;
+ case 5 ... 16:
+ *idx = 2;
+ break;
+ case 17 ... 64:
+ *idx = 3;
+ break;
+ case 65 ... 128:
+ *idx = 4;
+ break;
+ case 129 ... BIO_MAX_PAGES:
+ *idx = 5;
+ break;
default:
return NULL;
- }
- /*
- * idx now points to the pool we want to allocate from
- */
-
- bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
- if (bvl) {
- struct biovec_slab *bp = bvec_slabs + *idx;
+ }
- memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec));
- }
+ /*
+ * idx now points to the pool we want to allocate from
+ */
+ bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
+ if (bvl)
+ memset(bvl, 0,
+ bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
+ } else
+ bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask);
return bvl;
}
@@ -117,6 +114,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set)
mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
}
+ if (bio_integrity(bio))
+ bio_integrity_free(bio, bio_set);
+
mempool_free(bio, bio_set->bio_pool);
}
@@ -128,10 +128,17 @@ static void bio_fs_destructor(struct bio *bio)
bio_free(bio, fs_bio_set);
}
+static void bio_kmalloc_destructor(struct bio *bio)
+{
+ kfree(bio->bi_io_vec);
+ kfree(bio);
+}
+
void bio_init(struct bio *bio)
{
memset(bio, 0, sizeof(*bio));
bio->bi_flags = 1 << BIO_UPTODATE;
+ bio->bi_comp_cpu = -1;
atomic_set(&bio->bi_cnt, 1);
}
@@ -139,19 +146,25 @@ void bio_init(struct bio *bio)
* bio_alloc_bioset - allocate a bio for I/O
* @gfp_mask: the GFP_ mask given to the slab allocator
* @nr_iovecs: number of iovecs to pre-allocate
- * @bs: the bio_set to allocate from
+ * @bs: the bio_set to allocate from. If %NULL, just use kmalloc
*
* Description:
- * bio_alloc_bioset will first try it's on mempool to satisfy the allocation.
+ * bio_alloc_bioset will first try its own mempool to satisfy the allocation.
* If %__GFP_WAIT is set then we will block on the internal pool waiting
- * for a &struct bio to become free.
+ * for a &struct bio to become free. If a %NULL @bs is passed in, we will
+ * fall back to just using @kmalloc to allocate the required memory.
*
* allocate bio and iovecs from the memory pools specified by the
- * bio_set structure.
+ * bio_set structure, or @kmalloc if none given.
**/
struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
{
- struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);
+ struct bio *bio;
+
+ if (bs)
+ bio = mempool_alloc(bs->bio_pool, gfp_mask);
+ else
+ bio = kmalloc(sizeof(*bio), gfp_mask);
if (likely(bio)) {
struct bio_vec *bvl = NULL;
@@ -162,12 +175,15 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
if (unlikely(!bvl)) {
- mempool_free(bio, bs->bio_pool);
+ if (bs)
+ mempool_free(bio, bs->bio_pool);
+ else
+ kfree(bio);
bio = NULL;
goto out;
}
bio->bi_flags |= idx << BIO_POOL_OFFSET;
- bio->bi_max_vecs = bvec_slabs[idx].nr_vecs;
+ bio->bi_max_vecs = bvec_nr_vecs(idx);
}
bio->bi_io_vec = bvl;
}
@@ -185,6 +201,23 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
return bio;
}
+/*
+ * Like bio_alloc(), but doesn't use a mempool backing. This means that
+ * it CAN fail, but while bio_alloc() can only be used for allocations
+ * that have a short (finite) life span, bio_kmalloc() should be used
+ * for more permanent bio allocations (like allocating some bio's for
+ * initalization or setup purposes).
+ */
+struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs)
+{
+ struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
+
+ if (bio)
+ bio->bi_destructor = bio_kmalloc_destructor;
+
+ return bio;
+}
+
void zero_fill_bio(struct bio *bio)
{
unsigned long flags;
@@ -229,14 +262,6 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
return bio->bi_phys_segments;
}
-inline int bio_hw_segments(struct request_queue *q, struct bio *bio)
-{
- if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
- blk_recount_segments(q, bio);
-
- return bio->bi_hw_segments;
-}
-
/**
* __bio_clone - clone a bio
* @bio: destination bio
@@ -275,9 +300,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
{
struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
- if (b) {
- b->bi_destructor = bio_fs_destructor;
- __bio_clone(b, bio);
+ if (!b)
+ return NULL;
+
+ b->bi_destructor = bio_fs_destructor;
+ __bio_clone(b, bio);
+
+ if (bio_integrity(bio)) {
+ int ret;
+
+ ret = bio_integrity_clone(b, bio, fs_bio_set);
+
+ if (ret < 0)
+ return NULL;
}
return b;
@@ -333,10 +368,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
if (page == prev->bv_page &&
offset == prev->bv_offset + prev->bv_len) {
prev->bv_len += len;
- if (q->merge_bvec_fn &&
- q->merge_bvec_fn(q, bio, prev) < len) {
- prev->bv_len -= len;
- return 0;
+
+ if (q->merge_bvec_fn) {
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bio->bi_bdev,
+ .bi_sector = bio->bi_sector,
+ .bi_size = bio->bi_size,
+ .bi_rw = bio->bi_rw,
+ };
+
+ if (q->merge_bvec_fn(q, &bvm, prev) < len) {
+ prev->bv_len -= len;
+ return 0;
+ }
}
goto done;
@@ -352,8 +396,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
*/
while (bio->bi_phys_segments >= q->max_phys_segments
- || bio->bi_hw_segments >= q->max_hw_segments
- || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {
+ || bio->bi_phys_segments >= q->max_hw_segments) {
if (retried_segments)
return 0;
@@ -377,11 +420,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
* queue to get further control
*/
if (q->merge_bvec_fn) {
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bio->bi_bdev,
+ .bi_sector = bio->bi_sector,
+ .bi_size = bio->bi_size,
+ .bi_rw = bio->bi_rw,
+ };
+
/*
* merge_bvec_fn() returns number of bytes it can accept
* at this offset
*/
- if (q->merge_bvec_fn(q, bio, bvec) < len) {
+ if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
bvec->bv_page = NULL;
bvec->bv_len = 0;
bvec->bv_offset = 0;
@@ -390,13 +440,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
}
/* If we may be able to merge these biovecs, force a recount */
- if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
- BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
+ if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
bio->bi_flags &= ~(1 << BIO_SEG_VALID);
bio->bi_vcnt++;
bio->bi_phys_segments++;
- bio->bi_hw_segments++;
done:
bio->bi_size += len;
return len;
@@ -444,16 +492,19 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
struct bio_map_data {
struct bio_vec *iovecs;
- int nr_sgvecs;
struct sg_iovec *sgvecs;
+ int nr_sgvecs;
+ int is_our_pages;
};
static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
- struct sg_iovec *iov, int iov_count)
+ struct sg_iovec *iov, int iov_count,
+ int is_our_pages)
{
memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
bmd->nr_sgvecs = iov_count;
+ bmd->is_our_pages = is_our_pages;
bio->bi_private = bmd;
}
@@ -464,20 +515,21 @@ static void bio_free_map_data(struct bio_map_data *bmd)
kfree(bmd);
}
-static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)
+static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count,
+ gfp_t gfp_mask)
{
- struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);
+ struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask);
if (!bmd)
return NULL;
- bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
+ bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
if (!bmd->iovecs) {
kfree(bmd);
return NULL;
}
- bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL);
+ bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
if (bmd->sgvecs)
return bmd;
@@ -486,8 +538,9 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)
return NULL;
}
-static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
- int uncopy)
+static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
+ struct sg_iovec *iov, int iov_count, int uncopy,
+ int do_free_page)
{
int ret = 0, i;
struct bio_vec *bvec;
@@ -497,7 +550,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
__bio_for_each_segment(bvec, bio, i, 0) {
char *bv_addr = page_address(bvec->bv_page);
- unsigned int bv_len = bvec->bv_len;
+ unsigned int bv_len = iovecs[i].bv_len;
while (bv_len && iov_idx < iov_count) {
unsigned int bytes;
@@ -530,7 +583,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
}
}
- if (uncopy)
+ if (do_free_page)
__free_page(bvec->bv_page);
}
@@ -547,10 +600,11 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
int bio_uncopy_user(struct bio *bio)
{
struct bio_map_data *bmd = bio->bi_private;
- int ret;
-
- ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1);
+ int ret = 0;
+ if (!bio_flagged(bio, BIO_NULL_MAPPED))
+ ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
+ bmd->nr_sgvecs, 1, bmd->is_our_pages);
bio_free_map_data(bmd);
bio_put(bio);
return ret;
@@ -559,16 +613,20 @@ int bio_uncopy_user(struct bio *bio)
/**
* bio_copy_user_iov - copy user data to bio
* @q: destination block queue
+ * @map_data: pointer to the rq_map_data holding pages (if necessary)
* @iov: the iovec.
* @iov_count: number of elements in the iovec
* @write_to_vm: bool indicating writing to pages or not
+ * @gfp_mask: memory allocation flags
*
* Prepares and returns a bio for indirect user io, bouncing data
* to/from kernel pages as necessary. Must be paired with
* call bio_uncopy_user() on io completion.
*/
-struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
- int iov_count, int write_to_vm)
+struct bio *bio_copy_user_iov(struct request_queue *q,
+ struct rq_map_data *map_data,
+ struct sg_iovec *iov, int iov_count,
+ int write_to_vm, gfp_t gfp_mask)
{
struct bio_map_data *bmd;
struct bio_vec *bvec;
@@ -591,25 +649,38 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
len += iov[i].iov_len;
}
- bmd = bio_alloc_map_data(nr_pages, iov_count);
+ bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
if (!bmd)
return ERR_PTR(-ENOMEM);
ret = -ENOMEM;
- bio = bio_alloc(GFP_KERNEL, nr_pages);
+ bio = bio_alloc(gfp_mask, nr_pages);
if (!bio)
goto out_bmd;
bio->bi_rw |= (!write_to_vm << BIO_RW);
ret = 0;
+ i = 0;
while (len) {
- unsigned int bytes = PAGE_SIZE;
+ unsigned int bytes;
+
+ if (map_data)
+ bytes = 1U << (PAGE_SHIFT + map_data->page_order);
+ else
+ bytes = PAGE_SIZE;
if (bytes > len)
bytes = len;
- page = alloc_page(q->bounce_gfp | GFP_KERNEL);
+ if (map_data) {
+ if (i == map_data->nr_entries) {
+ ret = -ENOMEM;
+ break;
+ }
+ page = map_data->pages[i++];
+ } else
+ page = alloc_page(q->bounce_gfp | gfp_mask);
if (!page) {
ret = -ENOMEM;
break;
@@ -628,16 +699,17 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
* success
*/
if (!write_to_vm) {
- ret = __bio_copy_iov(bio, iov, iov_count, 0);
+ ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0);
if (ret)
goto cleanup;
}
- bio_set_map_data(bmd, bio, iov, iov_count);
+ bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
return bio;
cleanup:
- bio_for_each_segment(bvec, bio, i)
- __free_page(bvec->bv_page);
+ if (!map_data)
+ bio_for_each_segment(bvec, bio, i)
+ __free_page(bvec->bv_page);
bio_put(bio);
out_bmd:
@@ -648,29 +720,32 @@ out_bmd:
/**
* bio_copy_user - copy user data to bio
* @q: destination block queue
+ * @map_data: pointer to the rq_map_data holding pages (if necessary)
* @uaddr: start of user address
* @len: length in bytes
* @write_to_vm: bool indicating writing to pages or not
+ * @gfp_mask: memory allocation flags
*
* Prepares and returns a bio for indirect user io, bouncing data
* to/from kernel pages as necessary. Must be paired with
* call bio_uncopy_user() on io completion.
*/
-struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
- unsigned int len, int write_to_vm)
+struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
+ unsigned long uaddr, unsigned int len,
+ int write_to_vm, gfp_t gfp_mask)
{
struct sg_iovec iov;
iov.iov_base = (void __user *)uaddr;
iov.iov_len = len;
- return bio_copy_user_iov(q, &iov, 1, write_to_vm);
+ return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
}
static struct bio *__bio_map_user_iov(struct request_queue *q,
struct block_device *bdev,
struct sg_iovec *iov, int iov_count,
- int write_to_vm)
+ int write_to_vm, gfp_t gfp_mask)
{
int i, j;
int nr_pages = 0;
@@ -696,12 +771,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
if (!nr_pages)
return ERR_PTR(-EINVAL);
- bio = bio_alloc(GFP_KERNEL, nr_pages);
+ bio = bio_alloc(gfp_mask, nr_pages);
if (!bio)
return ERR_PTR(-ENOMEM);
ret = -ENOMEM;
- pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
+ pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
if (!pages)
goto out;
@@ -713,12 +788,8 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
const int local_nr_pages = end - start;
const int page_limit = cur_page + local_nr_pages;
- down_read(&current->mm->mmap_sem);
- ret = get_user_pages(current, current->mm, uaddr,
- local_nr_pages,
- write_to_vm, 0, &pages[cur_page], NULL);
- up_read(&current->mm->mmap_sem);
-
+ ret = get_user_pages_fast(uaddr, local_nr_pages,
+ write_to_vm, &pages[cur_page]);
if (ret < local_nr_pages) {
ret = -EFAULT;
goto out_unmap;
@@ -784,19 +855,21 @@ static struct bio *__bio_map_user_iov(struct request_queue *q,
* @uaddr: start of user address
* @len: length in bytes
* @write_to_vm: bool indicating writing to pages or not
+ * @gfp_mask: memory allocation flags
*
* Map the user space address into a bio suitable for io to a block
* device. Returns an error pointer in case of error.
*/
struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
- unsigned long uaddr, unsigned int len, int write_to_vm)
+ unsigned long uaddr, unsigned int len, int write_to_vm,
+ gfp_t gfp_mask)
{
struct sg_iovec iov;
iov.iov_base = (void __user *)uaddr;
iov.iov_len = len;
- return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
+ return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
}
/**
@@ -806,18 +879,19 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
* @iov: the iovec.
* @iov_count: number of elements in the iovec
* @write_to_vm: bool indicating writing to pages or not
+ * @gfp_mask: memory allocation flags
*
* Map the user space address into a bio suitable for io to a block
* device. Returns an error pointer in case of error.
*/
struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
struct sg_iovec *iov, int iov_count,
- int write_to_vm)
+ int write_to_vm, gfp_t gfp_mask)
{
struct bio *bio;
- bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm);
-
+ bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
+ gfp_mask);
if (IS_ERR(bio))
return bio;
@@ -941,19 +1015,22 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
{
struct bio_vec *bvec;
const int read = bio_data_dir(bio) == READ;
- char *p = bio->bi_private;
+ struct bio_map_data *bmd = bio->bi_private;
int i;
+ char *p = bmd->sgvecs[0].iov_base;
__bio_for_each_segment(bvec, bio, i, 0) {
char *addr = page_address(bvec->bv_page);
+ int len = bmd->iovecs[i].bv_len;
if (read && !err)
- memcpy(p, addr, bvec->bv_len);
+ memcpy(p, addr, len);
__free_page(bvec->bv_page);
- p += bvec->bv_len;
+ p += len;
}
+ bio_free_map_data(bmd);
bio_put(bio);
}
@@ -971,38 +1048,13 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
gfp_t gfp_mask, int reading)
{
- unsigned long kaddr = (unsigned long)data;
- unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = kaddr >> PAGE_SHIFT;
- const int nr_pages = end - start;
struct bio *bio;
struct bio_vec *bvec;
- int i, ret;
-
- bio = bio_alloc(gfp_mask, nr_pages);
- if (!bio)
- return ERR_PTR(-ENOMEM);
-
- while (len) {
- struct page *page;
- unsigned int bytes = PAGE_SIZE;
-
- if (bytes > len)
- bytes = len;
-
- page = alloc_page(q->bounce_gfp | gfp_mask);
- if (!page) {
- ret = -ENOMEM;
- goto cleanup;
- }
-
- if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) {
- ret = -EINVAL;
- goto cleanup;
- }
+ int i;
- len -= bytes;
- }
+ bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
+ if (IS_ERR(bio))
+ return bio;
if (!reading) {
void *p = data;
@@ -1015,16 +1067,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
}
}
- bio->bi_private = data;
bio->bi_end_io = bio_copy_kern_endio;
- return bio;
-cleanup:
- bio_for_each_segment(bvec, bio, i)
- __free_page(bvec->bv_page);
-
- bio_put(bio);
- return ERR_PTR(ret);
+ return bio;
}
/*
@@ -1211,9 +1256,9 @@ static void bio_pair_end_2(struct bio *bi, int err)
* split a bio - only worry about a bio with a single page
* in it's iovec
*/
-struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
+struct bio_pair *bio_split(struct bio *bi, int first_sectors)
{
- struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO);
+ struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
if (!bp)
return bp;
@@ -1247,11 +1292,50 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
bp->bio2.bi_end_io = bio_pair_end_2;
bp->bio1.bi_private = bi;
- bp->bio2.bi_private = pool;
+ bp->bio2.bi_private = bio_split_pool;
+
+ if (bio_integrity(bi))
+ bio_integrity_split(bi, bp, first_sectors);
return bp;
}
+/**
+ * bio_sector_offset - Find hardware sector offset in bio
+ * @bio: bio to inspect
+ * @index: bio_vec index
+ * @offset: offset in bv_page
+ *
+ * Return the number of hardware sectors between beginning of bio
+ * and an end point indicated by a bio_vec index and an offset
+ * within that vector's page.
+ */
+sector_t bio_sector_offset(struct bio *bio, unsigned short index,
+ unsigned int offset)
+{
+ unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue);
+ struct bio_vec *bv;
+ sector_t sectors;
+ int i;
+
+ sectors = 0;
+
+ if (index >= bio->bi_idx)
+ index = bio->bi_vcnt - 1;
+
+ __bio_for_each_segment(bv, bio, i, 0) {
+ if (i == index) {
+ if (offset > bv->bv_offset)
+ sectors += (offset - bv->bv_offset) / sector_sz;
+ break;
+ }
+
+ sectors += bv->bv_len / sector_sz;
+ }
+
+ return sectors;
+}
+EXPORT_SYMBOL(bio_sector_offset);
/*
* create memory pools for biovec's in a bio_set.
@@ -1290,6 +1374,7 @@ void bioset_free(struct bio_set *bs)
if (bs->bio_pool)
mempool_destroy(bs->bio_pool);
+ bioset_integrity_free(bs);
biovec_free_pools(bs);
kfree(bs);
@@ -1306,6 +1391,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
if (!bs->bio_pool)
goto bad;
+ if (bioset_integrity_create(bs, bio_pool_size))
+ goto bad;
+
if (!biovec_create_pools(bs, bvec_pool_size))
return bs;
@@ -1332,6 +1420,7 @@ static int __init init_bio(void)
{
bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+ bio_integrity_init_slab();
biovec_init_slabs();
fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
@@ -1349,6 +1438,7 @@ static int __init init_bio(void)
subsys_initcall(init_bio);
EXPORT_SYMBOL(bio_alloc);
+EXPORT_SYMBOL(bio_kmalloc);
EXPORT_SYMBOL(bio_put);
EXPORT_SYMBOL(bio_free);
EXPORT_SYMBOL(bio_endio);
@@ -1356,7 +1446,6 @@ EXPORT_SYMBOL(bio_init);
EXPORT_SYMBOL(__bio_clone);
EXPORT_SYMBOL(bio_clone);
EXPORT_SYMBOL(bio_phys_segments);
-EXPORT_SYMBOL(bio_hw_segments);
EXPORT_SYMBOL(bio_add_page);
EXPORT_SYMBOL(bio_add_pc_page);
EXPORT_SYMBOL(bio_get_nr_vecs);
@@ -1366,7 +1455,6 @@ EXPORT_SYMBOL(bio_map_kern);
EXPORT_SYMBOL(bio_copy_kern);
EXPORT_SYMBOL(bio_pair_release);
EXPORT_SYMBOL(bio_split);
-EXPORT_SYMBOL(bio_split_pool);
EXPORT_SYMBOL(bio_copy_user);
EXPORT_SYMBOL(bio_uncopy_user);
EXPORT_SYMBOL(bioset_create);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 470c10ceb0f..d84f0469a01 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -271,7 +271,7 @@ static void bdev_destroy_inode(struct inode *inode)
kmem_cache_free(bdev_cachep, bdi);
}
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
{
struct bdev_inode *ei = (struct bdev_inode *) foo;
struct block_device *bdev = &ei->bdev;
@@ -540,22 +540,6 @@ EXPORT_SYMBOL(bd_release);
* /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
*/
-static struct kobject *bdev_get_kobj(struct block_device *bdev)
-{
- if (bdev->bd_contains != bdev)
- return kobject_get(&bdev->bd_part->dev.kobj);
- else
- return kobject_get(&bdev->bd_disk->dev.kobj);
-}
-
-static struct kobject *bdev_get_holder(struct block_device *bdev)
-{
- if (bdev->bd_contains != bdev)
- return kobject_get(bdev->bd_part->holder_dir);
- else
- return kobject_get(bdev->bd_disk->holder_dir);
-}
-
static int add_symlink(struct kobject *from, struct kobject *to)
{
if (!from || !to)
@@ -604,11 +588,11 @@ static int bd_holder_grab_dirs(struct block_device *bdev,
if (!bo->hdev)
goto fail_put_sdir;
- bo->sdev = bdev_get_kobj(bdev);
+ bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj);
if (!bo->sdev)
goto fail_put_hdev;
- bo->hdir = bdev_get_holder(bdev);
+ bo->hdir = kobject_get(bdev->bd_part->holder_dir);
if (!bo->hdir)
goto fail_put_sdev;
@@ -868,6 +852,87 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode)
EXPORT_SYMBOL(open_by_devnum);
+/**
+ * flush_disk - invalidates all buffer-cache entries on a disk
+ *
+ * @bdev: struct block device to be flushed
+ *
+ * Invalidates all buffer-cache entries on a disk. It should be called
+ * when a disk has been changed -- either by a media change or online
+ * resize.
+ */
+static void flush_disk(struct block_device *bdev)
+{
+ if (__invalidate_device(bdev)) {
+ char name[BDEVNAME_SIZE] = "";
+
+ if (bdev->bd_disk)
+ disk_name(bdev->bd_disk, 0, name);
+ printk(KERN_WARNING "VFS: busy inodes on changed media or "
+ "resized disk %s\n", name);
+ }
+
+ if (!bdev->bd_disk)
+ return;
+ if (disk_partitionable(bdev->bd_disk))
+ bdev->bd_invalidated = 1;
+}
+
+/**
+ * check_disk_size_change - checks for disk size change and adjusts bdev size.
+ * @disk: struct gendisk to check
+ * @bdev: struct bdev to adjust.
+ *
+ * This routine checks to see if the bdev size does not match the disk size
+ * and adjusts it if it differs.
+ */
+void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
+{
+ loff_t disk_size, bdev_size;
+
+ disk_size = (loff_t)get_capacity(disk) << 9;
+ bdev_size = i_size_read(bdev->bd_inode);
+ if (disk_size != bdev_size) {
+ char name[BDEVNAME_SIZE];
+
+ disk_name(disk, 0, name);
+ printk(KERN_INFO
+ "%s: detected capacity change from %lld to %lld\n",
+ name, bdev_size, disk_size);
+ i_size_write(bdev->bd_inode, disk_size);
+ flush_disk(bdev);
+ }
+}
+EXPORT_SYMBOL(check_disk_size_change);
+
+/**
+ * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back
+ * @disk: struct gendisk to be revalidated
+ *
+ * This routine is a wrapper for lower-level driver's revalidate_disk
+ * call-backs. It is used to do common pre and post operations needed
+ * for all revalidate_disk operations.
+ */
+int revalidate_disk(struct gendisk *disk)
+{
+ struct block_device *bdev;
+ int ret = 0;
+
+ if (disk->fops->revalidate_disk)
+ ret = disk->fops->revalidate_disk(disk);
+
+ bdev = bdget_disk(disk, 0);
+ if (!bdev)
+ return ret;
+
+ mutex_lock(&bdev->bd_mutex);
+ check_disk_size_change(disk, bdev);
+ mutex_unlock(&bdev->bd_mutex);
+ bdput(bdev);
+ return ret;
+}
+EXPORT_SYMBOL(revalidate_disk);
+
/*
* This routine checks whether a removable media has been changed,
* and invalidates all buffer-cache-entries in that case. This
@@ -887,13 +952,9 @@ int check_disk_change(struct block_device *bdev)
if (!bdops->media_changed(bdev->bd_disk))
return 0;
- if (__invalidate_device(bdev))
- printk("VFS: busy inodes on changed media.\n");
-
+ flush_disk(bdev);
if (bdops->revalidate_disk)
bdops->revalidate_disk(bdev->bd_disk);
- if (bdev->bd_disk->minors > 1)
- bdev->bd_invalidated = 1;
return 1;
}
@@ -927,36 +988,48 @@ static int __blkdev_put(struct block_device *bdev, int for_part);
static int do_open(struct block_device *bdev, struct file *file, int for_part)
{
- struct module *owner = NULL;
struct gendisk *disk;
+ struct hd_struct *part = NULL;
int ret;
- int part;
+ int partno;
+ int perm = 0;
- ret = devcgroup_inode_permission(bdev->bd_inode, file->f_mode);
- if (ret != 0)
+ if (file->f_mode & FMODE_READ)
+ perm |= MAY_READ;
+ if (file->f_mode & FMODE_WRITE)
+ perm |= MAY_WRITE;
+ /*
+ * hooks: /n/, see "layering violations".
+ */
+ ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+ if (ret != 0) {
+ bdput(bdev);
return ret;
+ }
ret = -ENXIO;
file->f_mapping = bdev->bd_inode->i_mapping;
+
lock_kernel();
- disk = get_gendisk(bdev->bd_dev, &part);
- if (!disk) {
- unlock_kernel();
- bdput(bdev);
- return ret;
- }
- owner = disk->fops->owner;
+
+ disk = get_gendisk(bdev->bd_dev, &partno);
+ if (!disk)
+ goto out_unlock_kernel;
+ part = disk_get_part(disk, partno);
+ if (!part)
+ goto out_unlock_kernel;
mutex_lock_nested(&bdev->bd_mutex, for_part);
if (!bdev->bd_openers) {
bdev->bd_disk = disk;
+ bdev->bd_part = part;
bdev->bd_contains = bdev;
- if (!part) {
+ if (!partno) {
struct backing_dev_info *bdi;
if (disk->fops->open) {
ret = disk->fops->open(bdev->bd_inode, file);
if (ret)
- goto out_first;
+ goto out_clear;
}
if (!bdev->bd_openers) {
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
@@ -968,36 +1041,36 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
if (bdev->bd_invalidated)
rescan_partitions(disk, bdev);
} else {
- struct hd_struct *p;
struct block_device *whole;
whole = bdget_disk(disk, 0);
ret = -ENOMEM;
if (!whole)
- goto out_first;
+ goto out_clear;
BUG_ON(for_part);
ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1);
if (ret)
- goto out_first;
+ goto out_clear;
bdev->bd_contains = whole;
- p = disk->part[part - 1];
bdev->bd_inode->i_data.backing_dev_info =
whole->bd_inode->i_data.backing_dev_info;
- if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) {
+ if (!(disk->flags & GENHD_FL_UP) ||
+ !part || !part->nr_sects) {
ret = -ENXIO;
- goto out_first;
+ goto out_clear;
}
- kobject_get(&p->dev.kobj);
- bdev->bd_part = p;
- bd_set_size(bdev, (loff_t) p->nr_sects << 9);
+ bd_set_size(bdev, (loff_t)part->nr_sects << 9);
}
} else {
+ disk_put_part(part);
put_disk(disk);
- module_put(owner);
+ module_put(disk->fops->owner);
+ part = NULL;
+ disk = NULL;
if (bdev->bd_contains == bdev) {
if (bdev->bd_disk->fops->open) {
ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);
if (ret)
- goto out;
+ goto out_unlock_bdev;
}
if (bdev->bd_invalidated)
rescan_partitions(bdev->bd_disk, bdev);
@@ -1010,19 +1083,24 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
unlock_kernel();
return 0;
-out_first:
+ out_clear:
bdev->bd_disk = NULL;
+ bdev->bd_part = NULL;
bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, 1);
bdev->bd_contains = NULL;
- put_disk(disk);
- module_put(owner);
-out:
+ out_unlock_bdev:
mutex_unlock(&bdev->bd_mutex);
+ out_unlock_kernel:
unlock_kernel();
- if (ret)
- bdput(bdev);
+
+ disk_put_part(part);
+ if (disk)
+ module_put(disk->fops->owner);
+ put_disk(disk);
+ bdput(bdev);
+
return ret;
}
@@ -1107,11 +1185,8 @@ static int __blkdev_put(struct block_device *bdev, int for_part)
put_disk(disk);
module_put(owner);
-
- if (bdev->bd_contains != bdev) {
- kobject_put(&bdev->bd_part->dev.kobj);
- bdev->bd_part = NULL;
- }
+ disk_put_part(bdev->bd_part);
+ bdev->bd_part = NULL;
bdev->bd_disk = NULL;
bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
if (bdev != bdev->bd_contains)
@@ -1187,10 +1262,9 @@ EXPORT_SYMBOL(ioctl_by_bdev);
/**
* lookup_bdev - lookup a struct block_device by name
+ * @pathname: special file representing the block device
*
- * @path: special file representing the block device
- *
- * Get a reference to the blockdevice at @path in the current
+ * Get a reference to the blockdevice at @pathname in the current
* namespace if possible and return it. Return ERR_PTR(error)
* otherwise.
*/
@@ -1226,6 +1300,7 @@ fail:
bdev = ERR_PTR(error);
goto out;
}
+EXPORT_SYMBOL(lookup_bdev);
/**
* open_bdev_excl - open a block device by name and set it up for use
diff --git a/fs/buffer.c b/fs/buffer.c
index a073f3f4f01..ac78d4c19b3 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -580,7 +580,7 @@ EXPORT_SYMBOL(mark_buffer_async_write);
/*
* The buffer's backing address_space's private_lock must be held
*/
-static inline void __remove_assoc_queue(struct buffer_head *bh)
+static void __remove_assoc_queue(struct buffer_head *bh)
{
list_del_init(&bh->b_assoc_buffers);
WARN_ON(!bh->b_assoc_map);
@@ -706,7 +706,7 @@ static int __set_page_dirty(struct page *page,
if (TestSetPageDirty(page))
return 0;
- write_lock_irq(&mapping->tree_lock);
+ spin_lock_irq(&mapping->tree_lock);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
@@ -719,7 +719,7 @@ static int __set_page_dirty(struct page *page,
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
- write_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irq(&mapping->tree_lock);
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
return 1;
@@ -821,7 +821,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
* contents - it is a noop if I/O is still in
* flight on potentially older contents.
*/
- ll_rw_block(SWRITE, 1, &bh);
+ ll_rw_block(SWRITE_SYNC, 1, &bh);
brelse(bh);
spin_lock(lock);
}
@@ -1214,8 +1214,7 @@ void __brelse(struct buffer_head * buf)
put_bh(buf);
return;
}
- printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
- WARN_ON(1);
+ WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
}
/*
@@ -1464,7 +1463,7 @@ static void invalidate_bh_lru(void *arg)
void invalidate_bh_lrus(void)
{
- on_each_cpu(invalidate_bh_lru, NULL, 1, 1);
+ on_each_cpu(invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
@@ -1691,11 +1690,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
*/
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
- } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+ } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+ buffer_dirty(bh)) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, block, bh, 1);
if (err)
goto recover;
+ clear_buffer_delay(bh);
if (buffer_new(bh)) {
/* blockdev mappings never come here */
clear_buffer_new(bh);
@@ -1719,7 +1720,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
*/
if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
lock_buffer(bh);
- } else if (test_set_buffer_locked(bh)) {
+ } else if (!trylock_buffer(bh)) {
redirty_page_for_writepage(wbc, page);
continue;
}
@@ -1774,7 +1775,8 @@ recover:
bh = head;
/* Recovery: lock and submit the mapped buffers */
do {
- if (buffer_mapped(bh) && buffer_dirty(bh)) {
+ if (buffer_mapped(bh) && buffer_dirty(bh) &&
+ !buffer_delay(bh)) {
lock_buffer(bh);
mark_buffer_async_write(bh);
} else {
@@ -2061,6 +2063,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
+ int i_size_changed = 0;
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2073,17 +2076,72 @@ int generic_write_end(struct file *file, struct address_space *mapping,
*/
if (pos+copied > inode->i_size) {
i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
+ i_size_changed = 1;
}
unlock_page(page);
page_cache_release(page);
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ mark_inode_dirty(inode);
+
return copied;
}
EXPORT_SYMBOL(generic_write_end);
/*
+ * block_is_partially_uptodate checks whether buffers within a page are
+ * uptodate or not.
+ *
+ * Returns true if all buffers which correspond to a file portion
+ * we want to read are uptodate.
+ */
+int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
+ unsigned long from)
+{
+ struct inode *inode = page->mapping->host;
+ unsigned block_start, block_end, blocksize;
+ unsigned to;
+ struct buffer_head *bh, *head;
+ int ret = 1;
+
+ if (!page_has_buffers(page))
+ return 0;
+
+ blocksize = 1 << inode->i_blkbits;
+ to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
+ to = from + to;
+ if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
+ return 0;
+
+ head = page_buffers(page);
+ bh = head;
+ block_start = 0;
+ do {
+ block_end = block_start + blocksize;
+ if (block_end > from && block_start < to) {
+ if (!buffer_uptodate(bh)) {
+ ret = 0;
+ break;
+ }
+ if (block_end >= to)
+ break;
+ }
+ block_start = block_end;
+ bh = bh->b_this_page;
+ } while (bh != head);
+
+ return ret;
+}
+EXPORT_SYMBOL(block_is_partially_uptodate);
+
+/*
* Generic "read page" function for block devices that have the normal
* get_block functionality. This is most of the block device filesystems.
* Reads the page asynchronously --- the unlock_buffer() and
@@ -2868,14 +2926,17 @@ int submit_bh(int rw, struct buffer_head * bh)
BUG_ON(!buffer_mapped(bh));
BUG_ON(!bh->b_end_io);
- if (buffer_ordered(bh) && (rw == WRITE))
- rw = WRITE_BARRIER;
+ /*
+ * Mask in barrier bit for a write (could be either a WRITE or a
+ * WRITE_SYNC
+ */
+ if (buffer_ordered(bh) && (rw & WRITE))
+ rw |= WRITE_BARRIER;
/*
- * Only clear out a write error when rewriting, should this
- * include WRITE_SYNC as well?
+ * Only clear out a write error when rewriting
*/
- if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
+ if (test_set_buffer_req(bh) && (rw & WRITE))
clear_buffer_write_io_error(bh);
/*
@@ -2940,16 +3001,19 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
for (i = 0; i < nr; i++) {
struct buffer_head *bh = bhs[i];
- if (rw == SWRITE)
+ if (rw == SWRITE || rw == SWRITE_SYNC)
lock_buffer(bh);
- else if (test_set_buffer_locked(bh))
+ else if (!trylock_buffer(bh))
continue;
- if (rw == WRITE || rw == SWRITE) {
+ if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
if (test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(WRITE, bh);
+ if (rw == SWRITE_SYNC)
+ submit_bh(WRITE_SYNC, bh);
+ else
+ submit_bh(WRITE, bh);
continue;
}
} else {
@@ -2978,7 +3042,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
if (test_clear_buffer_dirty(bh)) {
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(WRITE, bh);
+ ret = submit_bh(WRITE_SYNC, bh);
wait_on_buffer(bh);
if (buffer_eopnotsupp(bh)) {
clear_buffer_eopnotsupp(bh);
@@ -3256,7 +3320,7 @@ int bh_submit_read(struct buffer_head *bh)
EXPORT_SYMBOL(bh_submit_read);
static void
-init_buffer_head(struct kmem_cache *cachep, void *data)
+init_buffer_head(void *data)
{
struct buffer_head *bh = data;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 68e510b8845..3cb7cda3d78 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -373,6 +373,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
return -ENXIO;
new = container_of(kobj, struct cdev, kobj);
spin_lock(&cdev_lock);
+ /* Check i_cdev again in case somebody beat us to it while
+ we dropped the lock. */
p = inode->i_cdev;
if (!p) {
inode->i_cdev = p = new;
@@ -392,11 +394,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
cdev_put(p);
return -ENXIO;
}
- if (filp->f_op->open) {
- lock_kernel();
+ if (filp->f_op->open)
ret = filp->f_op->open(inode,filp);
- unlock_kernel();
- }
if (ret)
cdev_put(p);
return ret;
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 1f3465201fd..06e521a945c 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,19 @@
+Version 1.54
+------------
+Fix premature write failure on congested networks (we would give up
+on EAGAIN from the socket too quickly on large writes).
+Cifs_mkdir and cifs_create now respect the setgid bit on parent dir.
+Fix endian problems in acl (mode from/to cifs acl) on bigendian
+architectures. Fix problems with preserving timestamps on copying open
+files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit
+on parent directory when server supports Unix Extensions but not POSIX
+create. Update cifs.upcall version to handle new Kerberos sec flags
+(this requires update of cifs.upcall program from Samba). Fix memory leak
+on dns_upcall (resolving DFS referralls). Fix plain text password
+authentication (requires setting SecurityFlags to 0x30030 to enable
+lanman and plain text though). Fix writes to be at correct offset when
+file is open with O_APPEND and file is on a directio (forcediretio) mount.
+
Version 1.53
------------
DFS support added (Microsoft Distributed File System client support needed
diff --git a/fs/cifs/README b/fs/cifs/README
index 2bd6fe556f8..bd2343d4c6a 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -542,10 +542,20 @@ SecurityFlags Flags which control security negotiation and
hashing mechanisms (as "must use") on the other hand
does not make much sense. Default flags are
0x07007
- (NTLM, NTLMv2 and packet signing allowed). Maximum
+ (NTLM, NTLMv2 and packet signing allowed). The maximum
allowable flags if you want to allow mounts to servers
using weaker password hashes is 0x37037 (lanman,
- plaintext, ntlm, ntlmv2, signing allowed):
+ plaintext, ntlm, ntlmv2, signing allowed). Some
+ SecurityFlags require the corresponding menuconfig
+ options to be enabled (lanman and plaintext require
+ CONFIG_CIFS_WEAK_PW_HASH for example). Enabling
+ plaintext authentication currently requires also
+ enabling lanman authentication in the security flags
+ because the cifs module only supports sending
+ laintext passwords using the older lanman dialect
+ form of the session setup SMB. (e.g. for authentication
+ using plain text passwords, set the SecurityFlags
+ to 0x30030):
may use packet signing 0x00001
must use packet signing 0x01001
@@ -642,8 +652,30 @@ The statistics for the number of total SMBs and oplock breaks are different in
that they represent all for that share, not just those for which the server
returned success.
-Also note that "cat /proc/fs/cifs/DebugData" will display information about
+Also note that "cat /proc/fs/cifs/DebugData" will display information about
the active sessions and the shares that are mounted.
-Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is
-on but requires a user space helper (from the Samba project). NTLM and NTLMv2 and
-LANMAN support do not require this helper.
+
+Enabling Kerberos (extended security) works but requires version 1.2 or later
+of the helper program cifs.upcall to be present and to be configured in the
+/etc/request-key.conf file. The cifs.upcall helper program is from the Samba
+project(http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not
+require this helper. Note that NTLMv2 security (which does not require the
+cifs.upcall helper program), instead of using Kerberos, is sufficient for
+some use cases.
+
+Enabling DFS support (used to access shares transparently in an MS-DFS
+global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled. In
+addition, DFS support for target shares which are specified as UNC
+names which begin with host names (rather than IP addresses) requires
+a user space helper (such as cifs.upcall) to be present in order to
+translate host names to ip address, and the user space helper must also
+be configured in the file /etc/request-key.conf
+
+To use cifs Kerberos and DFS support, the Linux keyutils package should be
+installed and something like the following lines should be added to the
+/etc/request-key.conf file:
+
+create cifs.spnego * * /usr/local/sbin/cifs.upcall %k
+create dns_resolver * * /usr/local/sbin/cifs.upcall %k
+
+
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index f58e41d3ba4..1b09f167006 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -400,7 +400,7 @@ asn1_oid_decode(struct asn1_ctx *ctx,
size = eoc - ctx->pointer + 1;
/* first subid actually encodes first two subids */
- if (size < 2 || size > ULONG_MAX/sizeof(unsigned long))
+ if (size < 2 || size > UINT_MAX/sizeof(unsigned long))
return 0;
*oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
@@ -476,6 +476,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
unsigned int cls, con, tag, oidlen, rc;
bool use_ntlmssp = false;
bool use_kerberos = false;
+ bool use_mskerberos = false;
*secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/
@@ -483,6 +484,7 @@ decode_negTokenInit(unsigned char *security_blob, int length,
asn1_open(&ctx, security_blob, length);
+ /* GSSAPI header */
if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
cFYI(1, ("Error decoding negTokenInit header"));
return 0;
@@ -490,156 +492,149 @@ decode_negTokenInit(unsigned char *security_blob, int length,
|| (tag != ASN1_EOC)) {
cFYI(1, ("cls = %d con = %d tag = %d", cls, con, tag));
return 0;
- } else {
- /* remember to free obj->oid */
- rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
- if (rc) {
- if ((tag == ASN1_OJI) && (cls == ASN1_PRI)) {
- rc = asn1_oid_decode(&ctx, end, &oid, &oidlen);
- if (rc) {
- rc = compare_oid(oid, oidlen,
- SPNEGO_OID,
- SPNEGO_OID_LEN);
- kfree(oid);
- }
- } else
- rc = 0;
- }
+ }
- if (!rc) {
- cFYI(1, ("Error decoding negTokenInit header"));
- return 0;
- }
+ /* Check for SPNEGO OID -- remember to free obj->oid */
+ rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
+ if (rc) {
+ if ((tag == ASN1_OJI) && (con == ASN1_PRI) &&
+ (cls == ASN1_UNI)) {
+ rc = asn1_oid_decode(&ctx, end, &oid, &oidlen);
+ if (rc) {
+ rc = compare_oid(oid, oidlen, SPNEGO_OID,
+ SPNEGO_OID_LEN);
+ kfree(oid);
+ }
+ } else
+ rc = 0;
+ }
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1, ("Error decoding negTokenInit"));
- return 0;
- } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
- || (tag != ASN1_EOC)) {
- cFYI(1,
- ("cls = %d con = %d tag = %d end = %p (%d) exit 0",
- cls, con, tag, end, *end));
- return 0;
- }
+ /* SPNEGO OID not present or garbled -- bail out */
+ if (!rc) {
+ cFYI(1, ("Error decoding negTokenInit header"));
+ return 0;
+ }
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1, ("Error decoding negTokenInit"));
- return 0;
- } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
- || (tag != ASN1_SEQ)) {
- cFYI(1,
- ("cls = %d con = %d tag = %d end = %p (%d) exit 1",
- cls, con, tag, end, *end));
- return 0;
- }
+ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+ cFYI(1, ("Error decoding negTokenInit"));
+ return 0;
+ } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
+ || (tag != ASN1_EOC)) {
+ cFYI(1,
+ ("cls = %d con = %d tag = %d end = %p (%d) exit 0",
+ cls, con, tag, end, *end));
+ return 0;
+ }
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1, ("Error decoding 2nd part of negTokenInit"));
- return 0;
- } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
- || (tag != ASN1_EOC)) {
- cFYI(1,
- ("cls = %d con = %d tag = %d end = %p (%d) exit 0",
- cls, con, tag, end, *end));
- return 0;
- }
+ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+ cFYI(1, ("Error decoding negTokenInit"));
+ return 0;
+ } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
+ || (tag != ASN1_SEQ)) {
+ cFYI(1,
+ ("cls = %d con = %d tag = %d end = %p (%d) exit 1",
+ cls, con, tag, end, *end));
+ return 0;
+ }
- if (asn1_header_decode
- (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
- cFYI(1, ("Error decoding 2nd part of negTokenInit"));
- return 0;
- } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
- || (tag != ASN1_SEQ)) {
- cFYI(1,
- ("cls = %d con = %d tag = %d end = %p (%d) exit 1",
- cls, con, tag, end, *end));
- return 0;
- }
+ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+ cFYI(1, ("Error decoding 2nd part of negTokenInit"));
+ return 0;
+ } else if ((cls != ASN1_CTX) || (con != ASN1_CON)
+ || (tag != ASN1_EOC)) {
+ cFYI(1,
+ ("cls = %d con = %d tag = %d end = %p (%d) exit 0",
+ cls, con, tag, end, *end));
+ return 0;
+ }
- while (!asn1_eoc_decode(&ctx, sequence_end)) {
- rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
- if (!rc) {
- cFYI(1,
- ("Error decoding negTokenInit hdr exit2"));
- return 0;
- }
- if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
- if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
-
- cFYI(1,
- ("OID len = %d oid = 0x%lx 0x%lx "
- "0x%lx 0x%lx",
- oidlen, *oid, *(oid + 1),
- *(oid + 2), *(oid + 3)));
-
- if (compare_oid(oid, oidlen,
- MSKRB5_OID,
- MSKRB5_OID_LEN))
- use_kerberos = true;
- else if (compare_oid(oid, oidlen,
- KRB5_OID,
- KRB5_OID_LEN))
- use_kerberos = true;
- else if (compare_oid(oid, oidlen,
- NTLMSSP_OID,
- NTLMSSP_OID_LEN))
- use_ntlmssp = true;
-
- kfree(oid);
- }
- } else {
- cFYI(1, ("Should be an oid what is going on?"));
- }
- }
+ if (asn1_header_decode
+ (&ctx, &sequence_end, &cls, &con, &tag) == 0) {
+ cFYI(1, ("Error decoding 2nd part of negTokenInit"));
+ return 0;
+ } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
+ || (tag != ASN1_SEQ)) {
+ cFYI(1,
+ ("cls = %d con = %d tag = %d end = %p (%d) exit 1",
+ cls, con, tag, end, *end));
+ return 0;
+ }
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1,
- ("Error decoding last part negTokenInit exit3"));
- return 0;
- } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
- /* tag = 3 indicating mechListMIC */
+ while (!asn1_eoc_decode(&ctx, sequence_end)) {
+ rc = asn1_header_decode(&ctx, &end, &cls, &con, &tag);
+ if (!rc) {
cFYI(1,
- ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
- cls, con, tag, end, *end));
+ ("Error decoding negTokenInit hdr exit2"));
return 0;
}
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1,
- ("Error decoding last part negTokenInit exit5"));
- return 0;
- } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
- || (tag != ASN1_SEQ)) {
- cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)",
- cls, con, tag, end, *end));
+ if ((tag == ASN1_OJI) && (con == ASN1_PRI)) {
+ if (asn1_oid_decode(&ctx, end, &oid, &oidlen)) {
+
+ cFYI(1, ("OID len = %d oid = 0x%lx 0x%lx "
+ "0x%lx 0x%lx", oidlen, *oid,
+ *(oid + 1), *(oid + 2), *(oid + 3)));
+
+ if (compare_oid(oid, oidlen, MSKRB5_OID,
+ MSKRB5_OID_LEN) &&
+ !use_kerberos)
+ use_mskerberos = true;
+ else if (compare_oid(oid, oidlen, KRB5_OID,
+ KRB5_OID_LEN) &&
+ !use_mskerberos)
+ use_kerberos = true;
+ else if (compare_oid(oid, oidlen, NTLMSSP_OID,
+ NTLMSSP_OID_LEN))
+ use_ntlmssp = true;
+
+ kfree(oid);
+ }
+ } else {
+ cFYI(1, ("Should be an oid what is going on?"));
}
+ }
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1,
- ("Error decoding last part negTokenInit exit 7"));
- return 0;
- } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
- cFYI(1,
- ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
- cls, con, tag, end, *end));
- return 0;
- }
- if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
- cFYI(1,
- ("Error decoding last part negTokenInit exit9"));
- return 0;
- } else if ((cls != ASN1_UNI) || (con != ASN1_PRI)
- || (tag != ASN1_GENSTR)) {
- cFYI(1,
- ("Exit10 cls = %d con = %d tag = %d end = %p (%d)",
- cls, con, tag, end, *end));
- return 0;
- }
- cFYI(1, ("Need to call asn1_octets_decode() function for %s",
- ctx.pointer)); /* is this UTF-8 or ASCII? */
+ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+ cFYI(1, ("Error decoding last part negTokenInit exit3"));
+ return 0;
+ } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
+ /* tag = 3 indicating mechListMIC */
+ cFYI(1, ("Exit 4 cls = %d con = %d tag = %d end = %p (%d)",
+ cls, con, tag, end, *end));
+ return 0;
+ }
+ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+ cFYI(1, ("Error decoding last part negTokenInit exit5"));
+ return 0;
+ } else if ((cls != ASN1_UNI) || (con != ASN1_CON)
+ || (tag != ASN1_SEQ)) {
+ cFYI(1, ("cls = %d con = %d tag = %d end = %p (%d)",
+ cls, con, tag, end, *end));
+ }
+
+ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+ cFYI(1, ("Error decoding last part negTokenInit exit 7"));
+ return 0;
+ } else if ((cls != ASN1_CTX) || (con != ASN1_CON)) {
+ cFYI(1, ("Exit 8 cls = %d con = %d tag = %d end = %p (%d)",
+ cls, con, tag, end, *end));
+ return 0;
+ }
+ if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) {
+ cFYI(1, ("Error decoding last part negTokenInit exit9"));
+ return 0;
+ } else if ((cls != ASN1_UNI) || (con != ASN1_PRI)
+ || (tag != ASN1_GENSTR)) {
+ cFYI(1, ("Exit10 cls = %d con = %d tag = %d end = %p (%d)",
+ cls, con, tag, end, *end));
+ return 0;
}
+ cFYI(1, ("Need to call asn1_octets_decode() function for %s",
+ ctx.pointer)); /* is this UTF-8 or ASCII? */
if (use_kerberos)
*secType = Kerberos;
+ else if (use_mskerberos)
+ *secType = MSKerberos;
else if (use_ntlmssp)
*secType = NTLMSSP;
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index cc950f69e51..69a12aae91d 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -79,27 +79,25 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
spin_lock(&GlobalMid_Lock);
list_for_each(tmp, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
- if (mid_entry) {
- cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
- mid_entry->midState,
- (int)mid_entry->command,
- mid_entry->pid,
- mid_entry->tsk,
- mid_entry->mid));
+ cERROR(1, ("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d",
+ mid_entry->midState,
+ (int)mid_entry->command,
+ mid_entry->pid,
+ mid_entry->tsk,
+ mid_entry->mid));
#ifdef CONFIG_CIFS_STATS2
- cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld",
- mid_entry->largeBuf,
- mid_entry->resp_buf,
- mid_entry->when_received,
- jiffies));
+ cERROR(1, ("IsLarge: %d buf: %p time rcv: %ld now: %ld",
+ mid_entry->largeBuf,
+ mid_entry->resp_buf,
+ mid_entry->when_received,
+ jiffies));
#endif /* STATS2 */
- cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp,
- mid_entry->multiEnd));
- if (mid_entry->resp_buf) {
- cifs_dump_detail(mid_entry->resp_buf);
- cifs_dump_mem("existing buf: ",
- mid_entry->resp_buf, 62);
- }
+ cERROR(1, ("IsMult: %d IsEnd: %d", mid_entry->multiRsp,
+ mid_entry->multiEnd));
+ if (mid_entry->resp_buf) {
+ cifs_dump_detail(mid_entry->resp_buf);
+ cifs_dump_mem("existing buf: ",
+ mid_entry->resp_buf, 62);
}
}
spin_unlock(&GlobalMid_Lock);
@@ -107,9 +105,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
#endif /* CONFIG_CIFS_DEBUG2 */
#ifdef CONFIG_PROC_FS
-static int
-cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
- int count, int *eof, void *data)
+static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
{
struct list_head *tmp;
struct list_head *tmp1;
@@ -117,23 +113,13 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
struct cifsSesInfo *ses;
struct cifsTconInfo *tcon;
int i;
- int length = 0;
- char *original_buf = buf;
-
- *beginBuffer = buf + offset;
- length =
- sprintf(buf,
+ seq_puts(m,
"Display Internal CIFS Data Structures for Debugging\n"
"---------------------------------------------------\n");
- buf += length;
- length = sprintf(buf, "CIFS Version %s\n", CIFS_VERSION);
- buf += length;
- length = sprintf(buf,
- "Active VFS Requests: %d\n", GlobalTotalActiveXid);
- buf += length;
- length = sprintf(buf, "Servers:");
- buf += length;
+ seq_printf(m, "CIFS Version %s\n", CIFS_VERSION);
+ seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid);
+ seq_printf(m, "Servers:");
i = 0;
read_lock(&GlobalSMBSeslock);
@@ -142,11 +128,10 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
ses = list_entry(tmp, struct cifsSesInfo, cifsSessionList);
if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) ||
(ses->serverNOS == NULL)) {
- buf += sprintf(buf, "\nentry for %s not fully "
+ seq_printf(m, "\nentry for %s not fully "
"displayed\n\t", ses->serverName);
} else {
- length =
- sprintf(buf,
+ seq_printf(m,
"\n%d) Name: %s Domain: %s Mounts: %d OS:"
" %s \n\tNOS: %s\tCapability: 0x%x\n\tSMB"
" session status: %d\t",
@@ -154,10 +139,9 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
atomic_read(&ses->inUse),
ses->serverOS, ses->serverNOS,
ses->capabilities, ses->status);
- buf += length;
}
if (ses->server) {
- buf += sprintf(buf, "TCP status: %d\n\tLocal Users To "
+ seq_printf(m, "TCP status: %d\n\tLocal Users To "
"Server: %d SecMode: 0x%x Req On Wire: %d",
ses->server->tcpStatus,
atomic_read(&ses->server->socketUseCount),
@@ -165,41 +149,34 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
atomic_read(&ses->server->inFlight));
#ifdef CONFIG_CIFS_STATS2
- buf += sprintf(buf, " In Send: %d In MaxReq Wait: %d",
+ seq_printf(m, " In Send: %d In MaxReq Wait: %d",
atomic_read(&ses->server->inSend),
atomic_read(&ses->server->num_waiters));
#endif
- length = sprintf(buf, "\nMIDs:\n");
- buf += length;
+ seq_puts(m, "\nMIDs:\n");
spin_lock(&GlobalMid_Lock);
list_for_each(tmp1, &ses->server->pending_mid_q) {
mid_entry = list_entry(tmp1, struct
mid_q_entry,
qhead);
- if (mid_entry) {
- length = sprintf(buf,
- "State: %d com: %d pid:"
- " %d tsk: %p mid %d\n",
- mid_entry->midState,
- (int)mid_entry->command,
- mid_entry->pid,
- mid_entry->tsk,
- mid_entry->mid);
- buf += length;
- }
+ seq_printf(m, "State: %d com: %d pid:"
+ " %d tsk: %p mid %d\n",
+ mid_entry->midState,
+ (int)mid_entry->command,
+ mid_entry->pid,
+ mid_entry->tsk,
+ mid_entry->mid);
}
spin_unlock(&GlobalMid_Lock);
}
}
read_unlock(&GlobalSMBSeslock);
- sprintf(buf, "\n");
- buf++;
+ seq_putc(m, '\n');
- length = sprintf(buf, "Shares:");
- buf += length;
+ seq_puts(m, "Shares:");
i = 0;
read_lock(&GlobalSMBSeslock);
@@ -208,62 +185,52 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset,
i++;
tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
dev_type = le32_to_cpu(tcon->fsDevInfo.DeviceType);
- length = sprintf(buf, "\n%d) %s Uses: %d ", i,
+ seq_printf(m, "\n%d) %s Uses: %d ", i,
tcon->treeName, atomic_read(&tcon->useCount));
- buf += length;
if (tcon->nativeFileSystem) {
- length = sprintf(buf, "Type: %s ",
+ seq_printf(m, "Type: %s ",
tcon->nativeFileSystem);
- buf += length;
}
- length = sprintf(buf, "DevInfo: 0x%x Attributes: 0x%x"
+ seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
"\nPathComponentMax: %d Status: %d",
le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
le32_to_cpu(tcon->fsAttrInfo.Attributes),
le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
tcon->tidStatus);
- buf += length;
if (dev_type == FILE_DEVICE_DISK)
- length = sprintf(buf, " type: DISK ");
+ seq_puts(m, " type: DISK ");
else if (dev_type == FILE_DEVICE_CD_ROM)
- length = sprintf(buf, " type: CDROM ");
+ seq_puts(m, " type: CDROM ");
else
- length =
- sprintf(buf, " type: %d ", dev_type);
- buf += length;
- if (tcon->tidStatus == CifsNeedReconnect) {
- buf += sprintf(buf, "\tDISCONNECTED ");
- length += 14;
- }
+ seq_printf(m, " type: %d ", dev_type);
+
+ if (tcon->tidStatus == CifsNeedReconnect)
+ seq_puts(m, "\tDISCONNECTED ");
}
read_unlock(&GlobalSMBSeslock);
- length = sprintf(buf, "\n");
- buf += length;
+ seq_putc(m, '\n');
/* BB add code to dump additional info such as TCP session info now */
- /* Now calculate total size of returned data */
- length = buf - original_buf;
-
- if (offset + count >= length)
- *eof = 1;
- if (length < offset) {
- *eof = 1;
- return 0;
- } else {
- length = length - offset;
- }
- if (length > count)
- length = count;
+ return 0;
+}
- return length;
+static int cifs_debug_data_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cifs_debug_data_proc_show, NULL);
}
-#ifdef CONFIG_CIFS_STATS
+static const struct file_operations cifs_debug_data_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = cifs_debug_data_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
-static int
-cifs_stats_write(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
+#ifdef CONFIG_CIFS_STATS
+static ssize_t cifs_stats_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
{
char c;
int rc;
@@ -307,236 +274,132 @@ cifs_stats_write(struct file *file, const char __user *buffer,
return count;
}
-static int
-cifs_stats_read(char *buf, char **beginBuffer, off_t offset,
- int count, int *eof, void *data)
+static int cifs_stats_proc_show(struct seq_file *m, void *v)
{
- int item_length, i, length;
+ int i;
struct list_head *tmp;
struct cifsTconInfo *tcon;
- *beginBuffer = buf + offset;
-
- length = sprintf(buf,
+ seq_printf(m,
"Resources in use\nCIFS Session: %d\n",
sesInfoAllocCount.counter);
- buf += length;
- item_length =
- sprintf(buf, "Share (unique mount targets): %d\n",
+ seq_printf(m, "Share (unique mount targets): %d\n",
tconInfoAllocCount.counter);
- length += item_length;
- buf += item_length;
- item_length =
- sprintf(buf, "SMB Request/Response Buffer: %d Pool size: %d\n",
+ seq_printf(m, "SMB Request/Response Buffer: %d Pool size: %d\n",
bufAllocCount.counter,
cifs_min_rcv + tcpSesAllocCount.counter);
- length += item_length;
- buf += item_length;
- item_length =
- sprintf(buf, "SMB Small Req/Resp Buffer: %d Pool size: %d\n",
+ seq_printf(m, "SMB Small Req/Resp Buffer: %d Pool size: %d\n",
smBufAllocCount.counter, cifs_min_small);
- length += item_length;
- buf += item_length;
#ifdef CONFIG_CIFS_STATS2
- item_length = sprintf(buf, "Total Large %d Small %d Allocations\n",
+ seq_printf(m, "Total Large %d Small %d Allocations\n",
atomic_read(&totBufAllocCount),
atomic_read(&totSmBufAllocCount));
- length += item_length;
- buf += item_length;
#endif /* CONFIG_CIFS_STATS2 */
- item_length =
- sprintf(buf, "Operations (MIDs): %d\n",
- midCount.counter);
- length += item_length;
- buf += item_length;
- item_length = sprintf(buf,
+ seq_printf(m, "Operations (MIDs): %d\n", midCount.counter);
+ seq_printf(m,
"\n%d session %d share reconnects\n",
tcpSesReconnectCount.counter, tconInfoReconnectCount.counter);
- length += item_length;
- buf += item_length;
- item_length = sprintf(buf,
+ seq_printf(m,
"Total vfs operations: %d maximum at one time: %d\n",
GlobalCurrentXid, GlobalMaxActiveXid);
- length += item_length;
- buf += item_length;
i = 0;
read_lock(&GlobalSMBSeslock);
list_for_each(tmp, &GlobalTreeConnectionList) {
i++;
tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
- item_length = sprintf(buf, "\n%d) %s", i, tcon->treeName);
- buf += item_length;
- length += item_length;
- if (tcon->tidStatus == CifsNeedReconnect) {
- buf += sprintf(buf, "\tDISCONNECTED ");
- length += 14;
- }
- item_length = sprintf(buf, "\nSMBs: %d Oplock Breaks: %d",
+ seq_printf(m, "\n%d) %s", i, tcon->treeName);
+ if (tcon->tidStatus == CifsNeedReconnect)
+ seq_puts(m, "\tDISCONNECTED ");
+ seq_printf(m, "\nSMBs: %d Oplock Breaks: %d",
atomic_read(&tcon->num_smbs_sent),
atomic_read(&tcon->num_oplock_brks));
- buf += item_length;
- length += item_length;
- item_length = sprintf(buf, "\nReads: %d Bytes: %lld",
+ seq_printf(m, "\nReads: %d Bytes: %lld",
atomic_read(&tcon->num_reads),
(long long)(tcon->bytes_read));
- buf += item_length;
- length += item_length;
- item_length = sprintf(buf, "\nWrites: %d Bytes: %lld",
+ seq_printf(m, "\nWrites: %d Bytes: %lld",
atomic_read(&tcon->num_writes),
(long long)(tcon->bytes_written));
- buf += item_length;
- length += item_length;
- item_length = sprintf(buf,
+ seq_printf(m,
"\nLocks: %d HardLinks: %d Symlinks: %d",
atomic_read(&tcon->num_locks),
atomic_read(&tcon->num_hardlinks),
atomic_read(&tcon->num_symlinks));
- buf += item_length;
- length += item_length;
- item_length = sprintf(buf, "\nOpens: %d Closes: %d Deletes: %d",
+ seq_printf(m, "\nOpens: %d Closes: %d Deletes: %d",
atomic_read(&tcon->num_opens),
atomic_read(&tcon->num_closes),
atomic_read(&tcon->num_deletes));
- buf += item_length;
- length += item_length;
- item_length = sprintf(buf, "\nMkdirs: %d Rmdirs: %d",
+ seq_printf(m, "\nMkdirs: %d Rmdirs: %d",
atomic_read(&tcon->num_mkdirs),
atomic_read(&tcon->num_rmdirs));
- buf += item_length;
- length += item_length;
- item_length = sprintf(buf, "\nRenames: %d T2 Renames %d",
+ seq_printf(m, "\nRenames: %d T2 Renames %d",
atomic_read(&tcon->num_renames),
atomic_read(&tcon->num_t2renames));
- buf += item_length;
- length += item_length;
- item_length = sprintf(buf, "\nFindFirst: %d FNext %d FClose %d",
+ seq_printf(m, "\nFindFirst: %d FNext %d FClose %d",
atomic_read(&tcon->num_ffirst),
atomic_read(&tcon->num_fnext),
atomic_read(&tcon->num_fclose));
- buf += item_length;
- length += item_length;
}
read_unlock(&GlobalSMBSeslock);
- buf += sprintf(buf, "\n");
- length++;
-
- if (offset + count >= length)
- *eof = 1;
- if (length < offset) {
- *eof = 1;
- return 0;
- } else {
- length = length - offset;
- }
- if (length > count)
- length = count;
+ seq_putc(m, '\n');
+ return 0;
+}
- return length;
+static int cifs_stats_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cifs_stats_proc_show, NULL);
}
+
+static const struct file_operations cifs_stats_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = cifs_stats_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = cifs_stats_proc_write,
+};
#endif /* STATS */
static struct proc_dir_entry *proc_fs_cifs;
-read_proc_t cifs_txanchor_read;
-static read_proc_t cifsFYI_read;
-static write_proc_t cifsFYI_write;
-static read_proc_t oplockEnabled_read;
-static write_proc_t oplockEnabled_write;
-static read_proc_t lookupFlag_read;
-static write_proc_t lookupFlag_write;
-static read_proc_t traceSMB_read;
-static write_proc_t traceSMB_write;
-static read_proc_t multiuser_mount_read;
-static write_proc_t multiuser_mount_write;
-static read_proc_t security_flags_read;
-static write_proc_t security_flags_write;
-/* static read_proc_t ntlmv2_enabled_read;
-static write_proc_t ntlmv2_enabled_write;
-static read_proc_t packet_signing_enabled_read;
-static write_proc_t packet_signing_enabled_write;*/
-static read_proc_t experimEnabled_read;
-static write_proc_t experimEnabled_write;
-static read_proc_t linuxExtensionsEnabled_read;
-static write_proc_t linuxExtensionsEnabled_write;
+static const struct file_operations cifsFYI_proc_fops;
+static const struct file_operations cifs_oplock_proc_fops;
+static const struct file_operations cifs_lookup_cache_proc_fops;
+static const struct file_operations traceSMB_proc_fops;
+static const struct file_operations cifs_multiuser_mount_proc_fops;
+static const struct file_operations cifs_security_flags_proc_fops;
+static const struct file_operations cifs_experimental_proc_fops;
+static const struct file_operations cifs_linux_ext_proc_fops;
void
cifs_proc_init(void)
{
- struct proc_dir_entry *pde;
-
proc_fs_cifs = proc_mkdir("fs/cifs", NULL);
if (proc_fs_cifs == NULL)
return;
proc_fs_cifs->owner = THIS_MODULE;
- create_proc_read_entry("DebugData", 0, proc_fs_cifs,
- cifs_debug_data_read, NULL);
+ proc_create("DebugData", 0, proc_fs_cifs, &cifs_debug_data_proc_fops);
#ifdef CONFIG_CIFS_STATS
- pde = create_proc_read_entry("Stats", 0, proc_fs_cifs,
- cifs_stats_read, NULL);
- if (pde)
- pde->write_proc = cifs_stats_write;
+ proc_create("Stats", 0, proc_fs_cifs, &cifs_stats_proc_fops);
#endif /* STATS */
- pde = create_proc_read_entry("cifsFYI", 0, proc_fs_cifs,
- cifsFYI_read, NULL);
- if (pde)
- pde->write_proc = cifsFYI_write;
-
- pde =
- create_proc_read_entry("traceSMB", 0, proc_fs_cifs,
- traceSMB_read, NULL);
- if (pde)
- pde->write_proc = traceSMB_write;
-
- pde = create_proc_read_entry("OplockEnabled", 0, proc_fs_cifs,
- oplockEnabled_read, NULL);
- if (pde)
- pde->write_proc = oplockEnabled_write;
-
- pde = create_proc_read_entry("Experimental", 0, proc_fs_cifs,
- experimEnabled_read, NULL);
- if (pde)
- pde->write_proc = experimEnabled_write;
-
- pde = create_proc_read_entry("LinuxExtensionsEnabled", 0, proc_fs_cifs,
- linuxExtensionsEnabled_read, NULL);
- if (pde)
- pde->write_proc = linuxExtensionsEnabled_write;
-
- pde =
- create_proc_read_entry("MultiuserMount", 0, proc_fs_cifs,
- multiuser_mount_read, NULL);
- if (pde)
- pde->write_proc = multiuser_mount_write;
-
- pde =
- create_proc_read_entry("SecurityFlags", 0, proc_fs_cifs,
- security_flags_read, NULL);
- if (pde)
- pde->write_proc = security_flags_write;
-
- pde =
- create_proc_read_entry("LookupCacheEnabled", 0, proc_fs_cifs,
- lookupFlag_read, NULL);
- if (pde)
- pde->write_proc = lookupFlag_write;
-
-/* pde =
- create_proc_read_entry("NTLMV2Enabled", 0, proc_fs_cifs,
- ntlmv2_enabled_read, NULL);
- if (pde)
- pde->write_proc = ntlmv2_enabled_write;
-
- pde =
- create_proc_read_entry("PacketSigningEnabled", 0, proc_fs_cifs,
- packet_signing_enabled_read, NULL);
- if (pde)
- pde->write_proc = packet_signing_enabled_write;*/
+ proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops);
+ proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops);
+ proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops);
+ proc_create("Experimental", 0, proc_fs_cifs,
+ &cifs_experimental_proc_fops);
+ proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs,
+ &cifs_linux_ext_proc_fops);
+ proc_create("MultiuserMount", 0, proc_fs_cifs,
+ &cifs_multiuser_mount_proc_fops);
+ proc_create("SecurityFlags", 0, proc_fs_cifs,
+ &cifs_security_flags_proc_fops);
+ proc_create("LookupCacheEnabled", 0, proc_fs_cifs,
+ &cifs_lookup_cache_proc_fops);
}
void
@@ -553,39 +416,26 @@ cifs_proc_clean(void)
#endif
remove_proc_entry("MultiuserMount", proc_fs_cifs);
remove_proc_entry("OplockEnabled", proc_fs_cifs);
-/* remove_proc_entry("NTLMV2Enabled",proc_fs_cifs); */
remove_proc_entry("SecurityFlags", proc_fs_cifs);
-/* remove_proc_entry("PacketSigningEnabled", proc_fs_cifs); */
remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
remove_proc_entry("Experimental", proc_fs_cifs);
remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
remove_proc_entry("fs/cifs", NULL);
}
-static int
-cifsFYI_read(char *page, char **start, off_t off, int count,
- int *eof, void *data)
+static int cifsFYI_proc_show(struct seq_file *m, void *v)
{
- int len;
-
- len = sprintf(page, "%d\n", cifsFYI);
-
- len -= off;
- *start = page + off;
-
- if (len > count)
- len = count;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
+ seq_printf(m, "%d\n", cifsFYI);
+ return 0;
+}
- return len;
+static int cifsFYI_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cifsFYI_proc_show, NULL);
}
-static int
-cifsFYI_write(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
+
+static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
{
char c;
int rc;
@@ -603,30 +453,28 @@ cifsFYI_write(struct file *file, const char __user *buffer,
return count;
}
-static int
-oplockEnabled_read(char *page, char **start, off_t off,
- int count, int *eof, void *data)
-{
- int len;
-
- len = sprintf(page, "%d\n", oplockEnabled);
-
- len -= off;
- *start = page + off;
+static const struct file_operations cifsFYI_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = cifsFYI_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = cifsFYI_proc_write,
+};
- if (len > count)
- len = count;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
+static int cifs_oplock_proc_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%d\n", oplockEnabled);
+ return 0;
+}
- return len;
+static int cifs_oplock_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cifs_oplock_proc_show, NULL);
}
-static int
-oplockEnabled_write(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
+
+static ssize_t cifs_oplock_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
{
char c;
int rc;
@@ -642,30 +490,28 @@ oplockEnabled_write(struct file *file, const char __user *buffer,
return count;
}
-static int
-experimEnabled_read(char *page, char **start, off_t off,
- int count, int *eof, void *data)
-{
- int len;
-
- len = sprintf(page, "%d\n", experimEnabled);
+static const struct file_operations cifs_oplock_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = cifs_oplock_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = cifs_oplock_proc_write,
+};
- len -= off;
- *start = page + off;
-
- if (len > count)
- len = count;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
+static int cifs_experimental_proc_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%d\n", experimEnabled);
+ return 0;
+}
- return len;
+static int cifs_experimental_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cifs_experimental_proc_show, NULL);
}
-static int
-experimEnabled_write(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
+
+static ssize_t cifs_experimental_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
{
char c;
int rc;
@@ -683,29 +529,28 @@ experimEnabled_write(struct file *file, const char __user *buffer,
return count;
}
-static int
-linuxExtensionsEnabled_read(char *page, char **start, off_t off,
- int count, int *eof, void *data)
-{
- int len;
-
- len = sprintf(page, "%d\n", linuxExtEnabled);
- len -= off;
- *start = page + off;
+static const struct file_operations cifs_experimental_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = cifs_experimental_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = cifs_experimental_proc_write,
+};
- if (len > count)
- len = count;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
+static int cifs_linux_ext_proc_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%d\n", linuxExtEnabled);
+ return 0;
+}
- return len;
+static int cifs_linux_ext_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cifs_linux_ext_proc_show, NULL);
}
-static int
-linuxExtensionsEnabled_write(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
+
+static ssize_t cifs_linux_ext_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
{
char c;
int rc;
@@ -721,31 +566,28 @@ linuxExtensionsEnabled_write(struct file *file, const char __user *buffer,
return count;
}
+static const struct file_operations cifs_linux_ext_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = cifs_linux_ext_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = cifs_linux_ext_proc_write,
+};
-static int
-lookupFlag_read(char *page, char **start, off_t off,
- int count, int *eof, void *data)
+static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v)
{
- int len;
-
- len = sprintf(page, "%d\n", lookupCacheEnabled);
-
- len -= off;
- *start = page + off;
-
- if (len > count)
- len = count;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
+ seq_printf(m, "%d\n", lookupCacheEnabled);
+ return 0;
+}
- return len;
+static int cifs_lookup_cache_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cifs_lookup_cache_proc_show, NULL);
}
-static int
-lookupFlag_write(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
+
+static ssize_t cifs_lookup_cache_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
{
char c;
int rc;
@@ -760,30 +602,29 @@ lookupFlag_write(struct file *file, const char __user *buffer,
return count;
}
-static int
-traceSMB_read(char *page, char **start, off_t off, int count,
- int *eof, void *data)
-{
- int len;
-
- len = sprintf(page, "%d\n", traceSMB);
-
- len -= off;
- *start = page + off;
- if (len > count)
- len = count;
- else
- *eof = 1;
+static const struct file_operations cifs_lookup_cache_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = cifs_lookup_cache_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = cifs_lookup_cache_proc_write,
+};
- if (len < 0)
- len = 0;
+static int traceSMB_proc_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%d\n", traceSMB);
+ return 0;
+}
- return len;
+static int traceSMB_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, traceSMB_proc_show, NULL);
}
-static int
-traceSMB_write(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
+
+static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
{
char c;
int rc;
@@ -799,30 +640,28 @@ traceSMB_write(struct file *file, const char __user *buffer,
return count;
}
-static int
-multiuser_mount_read(char *page, char **start, off_t off,
- int count, int *eof, void *data)
-{
- int len;
-
- len = sprintf(page, "%d\n", multiuser_mount);
-
- len -= off;
- *start = page + off;
+static const struct file_operations traceSMB_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = traceSMB_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = traceSMB_proc_write,
+};
- if (len > count)
- len = count;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
+static int cifs_multiuser_mount_proc_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "%d\n", multiuser_mount);
+ return 0;
+}
- return len;
+static int cifs_multiuser_mount_proc_open(struct inode *inode, struct file *fh)
+{
+ return single_open(fh, cifs_multiuser_mount_proc_show, NULL);
}
-static int
-multiuser_mount_write(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
+
+static ssize_t cifs_multiuser_mount_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
{
char c;
int rc;
@@ -838,30 +677,28 @@ multiuser_mount_write(struct file *file, const char __user *buffer,
return count;
}
-static int
-security_flags_read(char *page, char **start, off_t off,
- int count, int *eof, void *data)
-{
- int len;
-
- len = sprintf(page, "0x%x\n", extended_security);
-
- len -= off;
- *start = page + off;
+static const struct file_operations cifs_multiuser_mount_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = cifs_multiuser_mount_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = cifs_multiuser_mount_proc_write,
+};
- if (len > count)
- len = count;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
+static int cifs_security_flags_proc_show(struct seq_file *m, void *v)
+{
+ seq_printf(m, "0x%x\n", extended_security);
+ return 0;
+}
- return len;
+static int cifs_security_flags_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, cifs_security_flags_proc_show, NULL);
}
-static int
-security_flags_write(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
+
+static ssize_t cifs_security_flags_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
{
unsigned int flags;
char flags_string[12];
@@ -917,6 +754,15 @@ security_flags_write(struct file *file, const char __user *buffer,
/* BB should we turn on MAY flags for other MUST options? */
return count;
}
+
+static const struct file_operations cifs_security_flags_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = cifs_security_flags_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = cifs_security_flags_proc_write,
+};
#else
inline void cifs_proc_init(void)
{
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index d82374c9e32..d2c8eef84f3 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -226,7 +226,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
int err;
mntget(newmnt);
- err = do_add_mount(newmnt, nd, nd->path.mnt->mnt_flags, mntlist);
+ err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags, mntlist);
switch (err) {
case 0:
path_put(&nd->path);
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 7013aaff6ae..fcee9298b62 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -66,11 +66,28 @@ struct key_type cifs_spnego_key_type = {
.describe = user_describe,
};
-#define MAX_VER_STR_LEN 9 /* length of longest version string e.g.
- strlen(";ver=0xFF") */
-#define MAX_MECH_STR_LEN 13 /* length of longest security mechanism name, eg
- in future could have strlen(";sec=ntlmsspi") */
-#define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
+/* length of longest version string e.g. strlen("ver=0xFF") */
+#define MAX_VER_STR_LEN 8
+
+/* length of longest security mechanism name, eg in future could have
+ * strlen(";sec=ntlmsspi") */
+#define MAX_MECH_STR_LEN 13
+
+/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */
+#define MAX_IPV6_ADDR_LEN 42
+
+/* strlen of "host=" */
+#define HOST_KEY_LEN 5
+
+/* strlen of ";ip4=" or ";ip6=" */
+#define IP_KEY_LEN 5
+
+/* strlen of ";uid=0x" */
+#define UID_KEY_LEN 7
+
+/* strlen of ";user=" */
+#define USER_KEY_LEN 6
+
/* get a key struct with a SPNEGO security blob, suitable for session setup */
struct key *
cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
@@ -81,11 +98,15 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
struct key *spnego_key;
const char *hostname = server->hostname;
- /* BB: come up with better scheme for determining length */
- /* length of fields (with semicolons): ver=0xyz ipv4= ipaddress host=
- hostname sec=mechanism uid=0x uid */
- desc_len = MAX_VER_STR_LEN + 5 + MAX_IPV6_ADDR_LEN + 1 + 6 +
- strlen(hostname) + MAX_MECH_STR_LEN + 8 + (sizeof(uid_t) * 2);
+ /* length of fields (with semicolons): ver=0xyz ip4=ipaddress
+ host=hostname sec=mechanism uid=0xFF user=username */
+ desc_len = MAX_VER_STR_LEN +
+ HOST_KEY_LEN + strlen(hostname) +
+ IP_KEY_LEN + MAX_IPV6_ADDR_LEN +
+ MAX_MECH_STR_LEN +
+ UID_KEY_LEN + (sizeof(uid_t) * 2) +
+ USER_KEY_LEN + strlen(sesInfo->userName) + 1;
+
spnego_key = ERR_PTR(-ENOMEM);
description = kzalloc(desc_len, GFP_KERNEL);
if (description == NULL)
@@ -110,9 +131,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
dp = description + strlen(description);
- /* for now, only sec=krb5 is valid */
+ /* for now, only sec=krb5 and sec=mskrb5 are valid */
if (server->secType == Kerberos)
sprintf(dp, ";sec=krb5");
+ else if (server->secType == MSKerberos)
+ sprintf(dp, ";sec=mskrb5");
else
goto out;
diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h
index 05a34b17a1a..e4041ec4d71 100644
--- a/fs/cifs/cifs_spnego.h
+++ b/fs/cifs/cifs_spnego.h
@@ -23,7 +23,7 @@
#ifndef _CIFS_SPNEGO_H
#define _CIFS_SPNEGO_H
-#define CIFS_SPNEGO_UPCALL_VERSION 1
+#define CIFS_SPNEGO_UPCALL_VERSION 2
/*
* The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION.
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 34902cff540..57ecdc83c26 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -34,11 +34,11 @@
static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
{{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
{{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
- {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
- {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(544), 0, 0, 0} }, "root"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(545), 0, 0, 0} }, "users"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"} }
+ {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
+ {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
+ {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"},
+ {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"},
+ {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} }
;
@@ -56,7 +56,7 @@ int match_sid(struct cifs_sid *ctsid)
struct cifs_sid *cwsid;
if (!ctsid)
- return (-1);
+ return -1;
for (i = 0; i < NUM_WK_SIDS; ++i) {
cwsid = &(wksidarr[i].cifssid);
@@ -87,11 +87,11 @@ int match_sid(struct cifs_sid *ctsid)
}
cFYI(1, ("matching sid: %s\n", wksidarr[i].sidname));
- return (0); /* sids compare/match */
+ return 0; /* sids compare/match */
}
cFYI(1, ("No matching sid"));
- return (-1);
+ return -1;
}
/* if the two SIDs (roughly equivalent to a UUID for a user or group) are
@@ -102,16 +102,16 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
int num_subauth, num_sat, num_saw;
if ((!ctsid) || (!cwsid))
- return (0);
+ return 0;
/* compare the revision */
if (ctsid->revision != cwsid->revision)
- return (0);
+ return 0;
/* compare all of the six auth values */
for (i = 0; i < 6; ++i) {
if (ctsid->authority[i] != cwsid->authority[i])
- return (0);
+ return 0;
}
/* compare all of the subauth values if any */
@@ -121,11 +121,11 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
if (num_subauth) {
for (i = 0; i < num_subauth; ++i) {
if (ctsid->sub_auth[i] != cwsid->sub_auth[i])
- return (0);
+ return 0;
}
}
- return (1); /* sids compare/match */
+ return 1; /* sids compare/match */
}
@@ -169,8 +169,7 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd,
for (i = 0; i < 6; i++)
ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i];
for (i = 0; i < 5; i++)
- ngroup_sid_ptr->sub_auth[i] =
- cpu_to_le32(group_sid_ptr->sub_auth[i]);
+ ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i];
return;
}
@@ -285,7 +284,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4);
pntace->size = cpu_to_le16(size);
- return (size);
+ return size;
}
@@ -426,7 +425,7 @@ static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid,
pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl));
pndacl->num_aces = cpu_to_le32(3);
- return (0);
+ return 0;
}
@@ -510,7 +509,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len,
sizeof(struct cifs_sid)); */
- return (0);
+ return 0;
}
@@ -527,7 +526,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */
if ((inode == NULL) || (pntsd == NULL) || (pnntsd == NULL))
- return (-EIO);
+ return -EIO;
owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
le32_to_cpu(pntsd->osidoffset));
@@ -550,7 +549,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
/* copy security descriptor control portion and owner and group sid */
copy_sec_desc(pntsd, pnntsd, sidsoffset);
- return (rc);
+ return rc;
}
@@ -629,11 +628,11 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
if (!inode)
- return (rc);
+ return rc;
sb = inode->i_sb;
if (sb == NULL)
- return (rc);
+ return rc;
cifs_sb = CIFS_SB(sb);
xid = GetXid();
@@ -652,7 +651,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
if (rc != 0) {
cERROR(1, ("Unable to open file to set ACL"));
FreeXid(xid);
- return (rc);
+ return rc;
}
}
@@ -665,7 +664,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
FreeXid(xid);
- return (rc);
+ return rc;
}
/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
@@ -715,7 +714,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
if (!pnntsd) {
cERROR(1, ("Unable to allocate security descriptor"));
kfree(pntsd);
- return (-ENOMEM);
+ return -ENOMEM;
}
rc = build_sec_desc(pntsd, pnntsd, inode, nmode);
@@ -732,6 +731,6 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
kfree(pntsd);
}
- return (rc);
+ return rc;
}
#endif /* CONFIG_CIFS_EXPERIMENTAL */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 4ff8939c6cc..bd5f13d3845 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -294,6 +294,7 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key)
if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0)
if (extended_security & CIFSSEC_MAY_PLNTXT) {
+ memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE);
memcpy(lnm_session_key, password_with_pad,
CIFS_ENCPWD_SIZE);
return;
@@ -310,9 +311,8 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key)
utf8 and other multibyte codepages each need their own strupper
function since a byte at a time will ont work. */
- for (i = 0; i < CIFS_ENCPWD_SIZE; i++) {
+ for (i = 0; i < CIFS_ENCPWD_SIZE; i++)
password_with_pad[i] = toupper(password_with_pad[i]);
- }
SMBencrypt(password_with_pad, ses->server->cryptKey, lnm_session_key);
/* clear password before we return/free memory */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 86b4d5f405a..25ecbd5b040 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -175,6 +175,8 @@ out_no_root:
if (inode)
iput(inode);
+ cifs_umount(sb, cifs_sb);
+
out_mount_failed:
if (cifs_sb) {
#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -267,7 +269,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static int cifs_permission(struct inode *inode, int mask, struct nameidata *nd)
+static int cifs_permission(struct inode *inode, int mask)
{
struct cifs_sb_info *cifs_sb;
@@ -612,7 +614,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
if (retval < 0)
return (loff_t)retval;
}
- return remote_llseek(file, offset, origin);
+ return generic_file_llseek_unlocked(file, offset, origin);
}
struct file_system_type cifs_fs_type = {
@@ -766,7 +768,7 @@ const struct file_operations cifs_dir_ops = {
};
static void
-cifs_init_once(struct kmem_cache *cachep, void *inode)
+cifs_init_once(void *inode)
{
struct cifsInodeInfo *cifsi = inode;
@@ -930,36 +932,34 @@ static int cifs_oplock_thread(void *dummyarg)
schedule_timeout(39*HZ);
} else {
oplock_item = list_entry(GlobalOplock_Q.next,
- struct oplock_q_entry, qhead);
- if (oplock_item) {
- cFYI(1, ("found oplock item to write out"));
- pTcon = oplock_item->tcon;
- inode = oplock_item->pinode;
- netfid = oplock_item->netfid;
- spin_unlock(&GlobalMid_Lock);
- DeleteOplockQEntry(oplock_item);
- /* can not grab inode sem here since it would
+ struct oplock_q_entry, qhead);
+ cFYI(1, ("found oplock item to write out"));
+ pTcon = oplock_item->tcon;
+ inode = oplock_item->pinode;
+ netfid = oplock_item->netfid;
+ spin_unlock(&GlobalMid_Lock);
+ DeleteOplockQEntry(oplock_item);
+ /* can not grab inode sem here since it would
deadlock when oplock received on delete
since vfs_unlink holds the i_mutex across
the call */
- /* mutex_lock(&inode->i_mutex);*/
- if (S_ISREG(inode->i_mode)) {
- rc =
- filemap_fdatawrite(inode->i_mapping);
- if (CIFS_I(inode)->clientCanCacheRead
- == 0) {
- waitrc = filemap_fdatawait(inode->i_mapping);
- invalidate_remote_inode(inode);
- }
- if (rc == 0)
- rc = waitrc;
- } else
- rc = 0;
- /* mutex_unlock(&inode->i_mutex);*/
- if (rc)
- CIFS_I(inode)->write_behind_rc = rc;
- cFYI(1, ("Oplock flush inode %p rc %d",
- inode, rc));
+ /* mutex_lock(&inode->i_mutex);*/
+ if (S_ISREG(inode->i_mode)) {
+ rc = filemap_fdatawrite(inode->i_mapping);
+ if (CIFS_I(inode)->clientCanCacheRead == 0) {
+ waitrc = filemap_fdatawait(
+ inode->i_mapping);
+ invalidate_remote_inode(inode);
+ }
+ if (rc == 0)
+ rc = waitrc;
+ } else
+ rc = 0;
+ /* mutex_unlock(&inode->i_mutex);*/
+ if (rc)
+ CIFS_I(inode)->write_behind_rc = rc;
+ cFYI(1, ("Oplock flush inode %p rc %d",
+ inode, rc));
/* releasing stale oplock after recent reconnect
of smb session using a now incorrect file
@@ -967,15 +967,13 @@ static int cifs_oplock_thread(void *dummyarg)
not bother sending an oplock release if session
to server still is disconnected since oplock
already released by the server in that case */
- if (pTcon->tidStatus != CifsNeedReconnect) {
- rc = CIFSSMBLock(0, pTcon, netfid,
- 0 /* len */ , 0 /* offset */, 0,
- 0, LOCKING_ANDX_OPLOCK_RELEASE,
- false /* wait flag */);
- cFYI(1, ("Oplock release rc = %d", rc));
- }
- } else
- spin_unlock(&GlobalMid_Lock);
+ if (pTcon->tidStatus != CifsNeedReconnect) {
+ rc = CIFSSMBLock(0, pTcon, netfid,
+ 0 /* len */ , 0 /* offset */, 0,
+ 0, LOCKING_ANDX_OPLOCK_RELEASE,
+ false /* wait flag */);
+ cFYI(1, ("Oplock release rc = %d", rc));
+ }
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(1); /* yield in case q were corrupt */
}
@@ -1001,8 +999,7 @@ static int cifs_dnotify_thread(void *dummyarg)
list_for_each(tmp, &GlobalSMBSessionList) {
ses = list_entry(tmp, struct cifsSesInfo,
cifsSessionList);
- if (ses && ses->server &&
- atomic_read(&ses->server->inFlight))
+ if (ses->server && atomic_read(&ses->server->inFlight))
wake_up_all(&ses->server->response_q);
}
read_unlock(&GlobalSMBSeslock);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 25a6cbd1552..f7b4a5cd837 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -41,7 +41,7 @@ extern int cifs_create(struct inode *, struct dentry *, int,
struct nameidata *);
extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
struct nameidata *);
-extern int cifs_unlink(struct inode *, struct dentry *);
+extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t);
extern int cifs_mkdir(struct inode *, struct dentry *, int);
@@ -101,5 +101,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* EXPERIMENTAL */
-#define CIFS_VERSION "1.53"
+#define CIFS_VERSION "1.54"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 9cfcf326ead..0d22479d99b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -27,7 +27,7 @@
#define MAX_SES_INFO 2
#define MAX_TCON_INFO 4
-#define MAX_TREE_SIZE 2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1
+#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1)
#define MAX_SERVER_SIZE 15
#define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */
#define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null
@@ -80,7 +80,8 @@ enum securityEnum {
NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
RawNTLMSSP, /* NTLMSSP without SPNEGO */
NTLMSSP, /* NTLMSSP via SPNEGO */
- Kerberos /* Kerberos via SPNEGO */
+ Kerberos, /* Kerberos via SPNEGO */
+ MSKerberos, /* MS Kerberos via SPNEGO */
};
enum protocolEnum {
@@ -308,6 +309,7 @@ struct cifs_search_info {
__u32 resume_key;
char *ntwrk_buf_start;
char *srch_entries_start;
+ char *last_entry;
char *presume_name;
unsigned int resume_name_len;
bool endOfSearch:1;
@@ -537,8 +539,8 @@ require use of the stronger protocol */
#endif /* WEAK_PW_HASH */
#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
-#define CIFSSEC_DEF CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2
-#define CIFSSEC_MAX CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2
+#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2)
+#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5)
/*
*****************************************************************
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 0f327c224da..d2a073edd1b 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -31,7 +31,7 @@
#else
#define CIFS_PROT 0
#endif
-#define POSIX_PROT CIFS_PROT+1
+#define POSIX_PROT (CIFS_PROT+1)
#define BAD_PROT 0xFFFF
/* SMB command codes */
@@ -262,7 +262,7 @@
*/
#define CIFS_NO_HANDLE 0xFFFF
-#define NO_CHANGE_64 cpu_to_le64(0xFFFFFFFFFFFFFFFFULL)
+#define NO_CHANGE_64 0xFFFFFFFFFFFFFFFFULL
#define NO_CHANGE_32 0xFFFFFFFFUL
/* IPC$ in ASCII */
@@ -341,7 +341,7 @@
#define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */
#define CREATE_NO_EA_KNOWLEDGE 0x00000200
#define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this is obsolete
- "open for recovery" flag - should
+ "open for recovery" flag should
be zero in any case */
#define CREATE_OPEN_FOR_RECOVERY 0x00000400
#define CREATE_RANDOM_ACCESS 0x00000800
@@ -414,8 +414,8 @@ struct smb_hdr {
__u8 WordCount;
} __attribute__((packed));
/* given a pointer to an smb_hdr retrieve the value of byte count */
-#define BCC(smb_var) ( *(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
-#define BCC_LE(smb_var) ( *(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
+#define BCC(smb_var) (*(__u16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
+#define BCC_LE(smb_var) (*(__le16 *)((char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount)))
/* given a pointer to an smb_hdr retrieve the pointer to the byte area */
#define pByteArea(smb_var) ((unsigned char *)smb_var + sizeof(struct smb_hdr) + (2 * smb_var->WordCount) + 2)
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index b9f5e935f82..0cff7fe986e 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -172,12 +172,15 @@ extern int CIFSSMBQFSUnixInfo(const int xid, struct cifsTconInfo *tcon);
extern int CIFSSMBQFSPosixInfo(const int xid, struct cifsTconInfo *tcon,
struct kstatfs *FSData);
-extern int CIFSSMBSetTimes(const int xid, struct cifsTconInfo *tcon,
+extern int CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
const char *fileName, const FILE_BASIC_INFO *data,
const struct nls_table *nls_codepage,
int remap_special_chars);
-extern int CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
- const FILE_BASIC_INFO *data, __u16 fid);
+extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+ const FILE_BASIC_INFO *data, __u16 fid,
+ __u32 pid_of_opener);
+extern int CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
+ bool delete_file, __u16 fid, __u32 pid_of_opener);
#if 0
extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon,
char *fileName, __u16 dos_attributes,
@@ -191,9 +194,20 @@ extern int CIFSSMBSetEOF(const int xid, struct cifsTconInfo *tcon,
extern int CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon,
__u64 size, __u16 fileHandle, __u32 opener_pid,
bool AllocSizeFlag);
-extern int CIFSSMBUnixSetPerms(const int xid, struct cifsTconInfo *pTcon,
- char *full_path, __u64 mode, __u64 uid,
- __u64 gid, dev_t dev,
+
+struct cifs_unix_set_info_args {
+ __u64 ctime;
+ __u64 atime;
+ __u64 mtime;
+ __u64 mode;
+ __u64 uid;
+ __u64 gid;
+ dev_t device;
+};
+
+extern int CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *pTcon,
+ char *fileName,
+ const struct cifs_unix_set_info_args *args,
const struct nls_table *nls_codepage,
int remap_special_chars);
@@ -217,7 +231,7 @@ extern int CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
const struct nls_table *nls_codepage,
int remap_special_chars);
extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
- int netfid, char *target_name,
+ int netfid, const char *target_name,
const struct nls_table *nls_codepage,
int remap_special_chars);
extern int CIFSCreateHardLink(const int xid,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 4511b708f0f..6f4ffe15d68 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -128,8 +128,7 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon)
write_lock(&GlobalSMBSeslock);
list_for_each_safe(tmp, tmp1, &pTcon->openFileList) {
open_file = list_entry(tmp, struct cifsFileInfo, tlist);
- if (open_file)
- open_file->invalidHandle = true;
+ open_file->invalidHandle = true;
}
write_unlock(&GlobalSMBSeslock);
/* BB Add call to invalidate_inodes(sb) for all superblocks mounted
@@ -686,11 +685,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
SecurityBlob,
count - 16,
&server->secType);
- if (rc == 1) {
+ if (rc == 1)
rc = 0;
- } else {
+ else
rc = -EINVAL;
- }
}
} else
server->capabilities &= ~CAP_EXTENDED_SECURITY;
@@ -2019,7 +2017,7 @@ renameRetry:
}
int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
- int netfid, char *target_name,
+ int netfid, const char *target_name,
const struct nls_table *nls_codepage, int remap)
{
struct smb_com_transaction2_sfi_req *pSMB = NULL;
@@ -2073,7 +2071,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon,
remap);
}
rename_info->target_name_len = cpu_to_le32(2 * len_of_str);
- count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str) + 2;
+ count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str);
byte_count += count;
pSMB->DataCount = cpu_to_le16(count);
pSMB->TotalDataCount = pSMB->DataCount;
@@ -3616,6 +3614,8 @@ findFirstRetry:
/* BB remember to free buffer if error BB */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
if (rc == 0) {
+ unsigned int lnoff;
+
if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
psrch_inf->unicode = true;
else
@@ -3638,6 +3638,17 @@ findFirstRetry:
le16_to_cpu(parms->SearchCount);
psrch_inf->index_of_last_entry = 2 /* skip . and .. */ +
psrch_inf->entries_in_buffer;
+ lnoff = le16_to_cpu(parms->LastNameOffset);
+ if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
+ lnoff) {
+ cERROR(1, ("ignoring corrupt resume name"));
+ psrch_inf->last_entry = NULL;
+ return rc;
+ }
+
+ psrch_inf->last_entry = psrch_inf->srch_entries_start +
+ lnoff;
+
*pnetfid = parms->SearchHandle;
} else {
cifs_buf_release(pSMB);
@@ -3727,6 +3738,8 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
if (rc == 0) {
+ unsigned int lnoff;
+
/* BB fixme add lock for file (srch_info) struct here */
if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
psrch_inf->unicode = true;
@@ -3753,6 +3766,16 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
le16_to_cpu(parms->SearchCount);
psrch_inf->index_of_last_entry +=
psrch_inf->entries_in_buffer;
+ lnoff = le16_to_cpu(parms->LastNameOffset);
+ if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE <
+ lnoff) {
+ cERROR(1, ("ignoring corrupt resume name"));
+ psrch_inf->last_entry = NULL;
+ return rc;
+ } else
+ psrch_inf->last_entry =
+ psrch_inf->srch_entries_start + lnoff;
+
/* cFYI(1,("fnxt2 entries in buf %d index_of_last %d",
psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */
@@ -3914,7 +3937,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
bool is_unicode;
struct dfs_referral_level_3 *ref;
- is_unicode = pSMBr->hdr.Flags2 & SMBFLG2_UNICODE;
+ if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
+ is_unicode = true;
+ else
+ is_unicode = false;
*num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
if (*num_of_nodes < 1) {
@@ -4814,8 +4840,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
time and resort to the original setpathinfo level which takes the ancient
DOS time format with 2 second granularity */
int
-CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
- const FILE_BASIC_INFO *data, __u16 fid)
+CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon,
+ const FILE_BASIC_INFO *data, __u16 fid, __u32 pid_of_opener)
{
struct smb_com_transaction2_sfi_req *pSMB = NULL;
char *data_offset;
@@ -4828,11 +4854,8 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
if (rc)
return rc;
- /* At this point there is no need to override the current pid
- with the pid of the opener, but that could change if we someday
- use an existing handle (rather than opening one on the fly) */
- /* pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
- pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));*/
+ pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
+ pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
params = 6;
pSMB->MaxSetupCount = 0;
@@ -4878,11 +4901,66 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
return rc;
}
+int
+CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon,
+ bool delete_file, __u16 fid, __u32 pid_of_opener)
+{
+ struct smb_com_transaction2_sfi_req *pSMB = NULL;
+ char *data_offset;
+ int rc = 0;
+ __u16 params, param_offset, offset, byte_count, count;
+
+ cFYI(1, ("Set File Disposition (via SetFileInfo)"));
+ rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
+
+ if (rc)
+ return rc;
+
+ pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener);
+ pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16));
+
+ params = 6;
+ pSMB->MaxSetupCount = 0;
+ pSMB->Reserved = 0;
+ pSMB->Flags = 0;
+ pSMB->Timeout = 0;
+ pSMB->Reserved2 = 0;
+ param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
+ offset = param_offset + params;
+
+ data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
+
+ count = 1;
+ pSMB->MaxParameterCount = cpu_to_le16(2);
+ /* BB find max SMB PDU from sess */
+ pSMB->MaxDataCount = cpu_to_le16(1000);
+ pSMB->SetupCount = 1;
+ pSMB->Reserved3 = 0;
+ pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
+ byte_count = 3 /* pad */ + params + count;
+ pSMB->DataCount = cpu_to_le16(count);
+ pSMB->ParameterCount = cpu_to_le16(params);
+ pSMB->TotalDataCount = pSMB->DataCount;
+ pSMB->TotalParameterCount = pSMB->ParameterCount;
+ pSMB->ParameterOffset = cpu_to_le16(param_offset);
+ pSMB->DataOffset = cpu_to_le16(offset);
+ pSMB->Fid = fid;
+ pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO);
+ pSMB->Reserved4 = 0;
+ pSMB->hdr.smb_buf_length += byte_count;
+ pSMB->ByteCount = cpu_to_le16(byte_count);
+ *data_offset = delete_file ? 1 : 0;
+ rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+ if (rc)
+ cFYI(1, ("Send error in SetFileDisposition = %d", rc));
+
+ return rc;
+}
int
-CIFSSMBSetTimes(const int xid, struct cifsTconInfo *tcon, const char *fileName,
- const FILE_BASIC_INFO *data,
- const struct nls_table *nls_codepage, int remap)
+CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon,
+ const char *fileName, const FILE_BASIC_INFO *data,
+ const struct nls_table *nls_codepage, int remap)
{
TRANSACTION2_SPI_REQ *pSMB = NULL;
TRANSACTION2_SPI_RSP *pSMBr = NULL;
@@ -5011,10 +5089,9 @@ SetAttrLgcyRetry:
#endif /* temporarily unneeded SetAttr legacy function */
int
-CIFSSMBUnixSetPerms(const int xid, struct cifsTconInfo *tcon,
- char *fileName, __u64 mode, __u64 uid, __u64 gid,
- dev_t device, const struct nls_table *nls_codepage,
- int remap)
+CIFSSMBUnixSetInfo(const int xid, struct cifsTconInfo *tcon, char *fileName,
+ const struct cifs_unix_set_info_args *args,
+ const struct nls_table *nls_codepage, int remap)
{
TRANSACTION2_SPI_REQ *pSMB = NULL;
TRANSACTION2_SPI_RSP *pSMBr = NULL;
@@ -5023,6 +5100,7 @@ CIFSSMBUnixSetPerms(const int xid, struct cifsTconInfo *tcon,
int bytes_returned = 0;
FILE_UNIX_BASIC_INFO *data_offset;
__u16 params, param_offset, offset, count, byte_count;
+ __u64 mode = args->mode;
cFYI(1, ("In SetUID/GID/Mode"));
setPermsRetry:
@@ -5078,16 +5156,16 @@ setPermsRetry:
set file size and do not want to truncate file size to zero
accidently as happened on one Samba server beta by putting
zero instead of -1 here */
- data_offset->EndOfFile = NO_CHANGE_64;
- data_offset->NumOfBytes = NO_CHANGE_64;
- data_offset->LastStatusChange = NO_CHANGE_64;
- data_offset->LastAccessTime = NO_CHANGE_64;
- data_offset->LastModificationTime = NO_CHANGE_64;
- data_offset->Uid = cpu_to_le64(uid);
- data_offset->Gid = cpu_to_le64(gid);
+ data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64);
+ data_offset->NumOfBytes = cpu_to_le64(NO_CHANGE_64);
+ data_offset->LastStatusChange = cpu_to_le64(args->ctime);
+ data_offset->LastAccessTime = cpu_to_le64(args->atime);
+ data_offset->LastModificationTime = cpu_to_le64(args->mtime);
+ data_offset->Uid = cpu_to_le64(args->uid);
+ data_offset->Gid = cpu_to_le64(args->gid);
/* better to leave device as zero when it is */
- data_offset->DevMajor = cpu_to_le64(MAJOR(device));
- data_offset->DevMinor = cpu_to_le64(MINOR(device));
+ data_offset->DevMajor = cpu_to_le64(MAJOR(args->device));
+ data_offset->DevMinor = cpu_to_le64(MINOR(args->device));
data_offset->Permissions = cpu_to_le64(mode);
if (S_ISREG(mode))
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e8fa46c7cff..4c13bcdb92a 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -151,7 +151,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
}
list_for_each(tmp, &GlobalTreeConnectionList) {
tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList);
- if ((tcon) && (tcon->ses) && (tcon->ses->server == server))
+ if ((tcon->ses) && (tcon->ses->server == server))
tcon->tidStatus = CifsNeedReconnect;
}
read_unlock(&GlobalSMBSeslock);
@@ -173,14 +173,12 @@ cifs_reconnect(struct TCP_Server_Info *server)
mid_entry = list_entry(tmp, struct
mid_q_entry,
qhead);
- if (mid_entry) {
- if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
+ if (mid_entry->midState == MID_REQUEST_SUBMITTED) {
/* Mark other intransit requests as needing
retry so we do not immediately mark the
session bad again (ie after we reconnect
below) as they timeout too */
- mid_entry->midState = MID_RETRY_NEEDED;
- }
+ mid_entry->midState = MID_RETRY_NEEDED;
}
}
spin_unlock(&GlobalMid_Lock);
@@ -351,11 +349,9 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
current->flags |= PF_MEMALLOC;
cFYI(1, ("Demultiplex PID: %d", task_pid_nr(current)));
- write_lock(&GlobalSMBSeslock);
- atomic_inc(&tcpSesAllocCount);
- length = tcpSesAllocCount.counter;
- write_unlock(&GlobalSMBSeslock);
- if (length > 1)
+
+ length = atomic_inc_return(&tcpSesAllocCount);
+ if (length > 1)
mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
GFP_KERNEL);
@@ -455,7 +451,7 @@ incomplete_rcv:
/* Note that FC 1001 length is big endian on the wire,
but we convert it here so it is always manipulated
as host byte order */
- pdu_length = ntohl(smb_buffer->smb_buf_length);
+ pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length);
smb_buffer->smb_buf_length = pdu_length;
cFYI(1, ("rfc1002 length 0x%x", pdu_length+4));
@@ -745,14 +741,11 @@ multi_t2_fnd:
coming home not much else we can do but free the memory */
}
- write_lock(&GlobalSMBSeslock);
- atomic_dec(&tcpSesAllocCount);
- length = tcpSesAllocCount.counter;
-
/* last chance to mark ses pointers invalid
if there are any pointing to this (e.g
if a crazy root user tried to kill cifsd
kernel thread explicitly this might happen) */
+ write_lock(&GlobalSMBSeslock);
list_for_each(tmp, &GlobalSMBSessionList) {
ses = list_entry(tmp, struct cifsSesInfo,
cifsSessionList);
@@ -763,6 +756,8 @@ multi_t2_fnd:
kfree(server->hostname);
kfree(server);
+
+ length = atomic_dec_return(&tcpSesAllocCount);
if (length > 0)
mempool_resize(cifs_req_poolp, length + cifs_min_rcv,
GFP_KERNEL);
@@ -1461,6 +1456,39 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
return rc;
}
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key cifs_key[2];
+static struct lock_class_key cifs_slock_key[2];
+
+static inline void
+cifs_reclassify_socket4(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ BUG_ON(sock_owned_by_user(sk));
+ sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS",
+ &cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]);
+}
+
+static inline void
+cifs_reclassify_socket6(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ BUG_ON(sock_owned_by_user(sk));
+ sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS",
+ &cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]);
+}
+#else
+static inline void
+cifs_reclassify_socket4(struct socket *sock)
+{
+}
+
+static inline void
+cifs_reclassify_socket6(struct socket *sock)
+{
+}
+#endif
+
/* See RFC1001 section 14 on representation of Netbios names */
static void rfc1002mangle(char *target, char *source, unsigned int length)
{
@@ -1495,6 +1523,7 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
/* BB other socket options to set KEEPALIVE, NODELAY? */
cFYI(1, ("Socket created"));
(*csocket)->sk->sk_allocation = GFP_NOFS;
+ cifs_reclassify_socket4(*csocket);
}
}
@@ -1627,6 +1656,7 @@ ipv6_connect(struct sockaddr_in6 *psin_server, struct socket **csocket)
/* BB other socket options to set KEEPALIVE, NODELAY? */
cFYI(1, ("ipv6 Socket created"));
(*csocket)->sk->sk_allocation = GFP_NOFS;
+ cifs_reclassify_socket6(*csocket);
}
}
@@ -3568,19 +3598,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
char ntlm_session_key[CIFS_SESS_KEY_SIZE];
bool ntlmv2_flag = false;
int first_time = 0;
+ struct TCP_Server_Info *server = pSesInfo->server;
/* what if server changes its buffer size after dropping the session? */
- if (pSesInfo->server->maxBuf == 0) /* no need to send on reconnect */ {
+ if (server->maxBuf == 0) /* no need to send on reconnect */ {
rc = CIFSSMBNegotiate(xid, pSesInfo);
- if (rc == -EAGAIN) /* retry only once on 1st time connection */ {
+ if (rc == -EAGAIN) {
+ /* retry only once on 1st time connection */
rc = CIFSSMBNegotiate(xid, pSesInfo);
if (rc == -EAGAIN)
rc = -EHOSTDOWN;
}
if (rc == 0) {
spin_lock(&GlobalMid_Lock);
- if (pSesInfo->server->tcpStatus != CifsExiting)
- pSesInfo->server->tcpStatus = CifsGood;
+ if (server->tcpStatus != CifsExiting)
+ server->tcpStatus = CifsGood;
else
rc = -EHOSTDOWN;
spin_unlock(&GlobalMid_Lock);
@@ -3588,97 +3620,90 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo,
}
first_time = 1;
}
- if (!rc) {
- pSesInfo->flags = 0;
- pSesInfo->capabilities = pSesInfo->server->capabilities;
- if (linuxExtEnabled == 0)
- pSesInfo->capabilities &= (~CAP_UNIX);
+
+ if (rc)
+ goto ss_err_exit;
+
+ pSesInfo->flags = 0;
+ pSesInfo->capabilities = server->capabilities;
+ if (linuxExtEnabled == 0)
+ pSesInfo->capabilities &= (~CAP_UNIX);
/* pSesInfo->sequence_number = 0;*/
- cFYI(1,
- ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
- pSesInfo->server->secMode,
- pSesInfo->server->capabilities,
- pSesInfo->server->timeAdj));
- if (experimEnabled < 2)
- rc = CIFS_SessSetup(xid, pSesInfo,
- first_time, nls_info);
- else if (extended_security
- && (pSesInfo->capabilities
- & CAP_EXTENDED_SECURITY)
- && (pSesInfo->server->secType == NTLMSSP)) {
- rc = -EOPNOTSUPP;
- } else if (extended_security
- && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
- && (pSesInfo->server->secType == RawNTLMSSP)) {
- cFYI(1, ("NTLMSSP sesssetup"));
- rc = CIFSNTLMSSPNegotiateSessSetup(xid,
- pSesInfo,
- &ntlmv2_flag,
- nls_info);
- if (!rc) {
- if (ntlmv2_flag) {
- char *v2_response;
- cFYI(1, ("more secure NTLM ver2 hash"));
- if (CalcNTLMv2_partial_mac_key(pSesInfo,
- nls_info)) {
- rc = -ENOMEM;
- goto ss_err_exit;
- } else
- v2_response = kmalloc(16 + 64 /* blob */, GFP_KERNEL);
- if (v2_response) {
- CalcNTLMv2_response(pSesInfo,
- v2_response);
- /* if (first_time)
- cifs_calculate_ntlmv2_mac_key(
- pSesInfo->server->mac_signing_key,
- response, ntlm_session_key,*/
- kfree(v2_response);
+ cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d",
+ server->secMode, server->capabilities, server->timeAdj));
+
+ if (experimEnabled < 2)
+ rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info);
+ else if (extended_security
+ && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
+ && (server->secType == NTLMSSP)) {
+ rc = -EOPNOTSUPP;
+ } else if (extended_security
+ && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY)
+ && (server->secType == RawNTLMSSP)) {
+ cFYI(1, ("NTLMSSP sesssetup"));
+ rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag,
+ nls_info);
+ if (!rc) {
+ if (ntlmv2_flag) {
+ char *v2_response;
+ cFYI(1, ("more secure NTLM ver2 hash"));
+ if (CalcNTLMv2_partial_mac_key(pSesInfo,
+ nls_info)) {
+ rc = -ENOMEM;
+ goto ss_err_exit;
+ } else
+ v2_response = kmalloc(16 + 64 /* blob*/,
+ GFP_KERNEL);
+ if (v2_response) {
+ CalcNTLMv2_response(pSesInfo,
+ v2_response);
+ /* if (first_time)
+ cifs_calculate_ntlmv2_mac_key */
+ kfree(v2_response);
/* BB Put dummy sig in SessSetup PDU? */
- } else {
- rc = -ENOMEM;
- goto ss_err_exit;
- }
-
} else {
- SMBNTencrypt(pSesInfo->password,
- pSesInfo->server->cryptKey,
- ntlm_session_key);
-
- if (first_time)
- cifs_calculate_mac_key(
- &pSesInfo->server->mac_signing_key,
- ntlm_session_key,
- pSesInfo->password);
+ rc = -ENOMEM;
+ goto ss_err_exit;
}
+
+ } else {
+ SMBNTencrypt(pSesInfo->password,
+ server->cryptKey,
+ ntlm_session_key);
+
+ if (first_time)
+ cifs_calculate_mac_key(
+ &server->mac_signing_key,
+ ntlm_session_key,
+ pSesInfo->password);
+ }
/* for better security the weaker lanman hash not sent
in AuthSessSetup so we no longer calculate it */
- rc = CIFSNTLMSSPAuthSessSetup(xid,
- pSesInfo,
- ntlm_session_key,
- ntlmv2_flag,
- nls_info);
- }
- } else { /* old style NTLM 0.12 session setup */
- SMBNTencrypt(pSesInfo->password,
- pSesInfo->server->cryptKey,
- ntlm_session_key);
+ rc = CIFSNTLMSSPAuthSessSetup(xid, pSesInfo,
+ ntlm_session_key,
+ ntlmv2_flag,
+ nls_info);
+ }
+ } else { /* old style NTLM 0.12 session setup */
+ SMBNTencrypt(pSesInfo->password, server->cryptKey,
+ ntlm_session_key);
- if (first_time)
- cifs_calculate_mac_key(
- &pSesInfo->server->mac_signing_key,
- ntlm_session_key, pSesInfo->password);
+ if (first_time)
+ cifs_calculate_mac_key(&server->mac_signing_key,
+ ntlm_session_key,
+ pSesInfo->password);
- rc = CIFSSessSetup(xid, pSesInfo,
- ntlm_session_key, nls_info);
- }
- if (rc) {
- cERROR(1, ("Send error in SessSetup = %d", rc));
- } else {
- cFYI(1, ("CIFS Session Established successfully"));
+ rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info);
+ }
+ if (rc) {
+ cERROR(1, ("Send error in SessSetup = %d", rc));
+ } else {
+ cFYI(1, ("CIFS Session Established successfully"));
pSesInfo->status = CifsGood;
- }
}
+
ss_err_exit:
return rc;
}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index fb69c1fa85c..e962e75e6f7 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -226,23 +226,28 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
/* If Open reported that we actually created a file
then we now have to set the mode if possible */
if ((pTcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
+ struct cifs_unix_set_info_args args = {
+ .mode = mode,
+ .ctime = NO_CHANGE_64,
+ .atime = NO_CHANGE_64,
+ .mtime = NO_CHANGE_64,
+ .device = 0,
+ };
+
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode,
- (__u64)current->fsuid,
- (__u64)current->fsgid,
- 0 /* dev */,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ args.uid = (__u64) current->fsuid;
+ if (inode->i_mode & S_ISGID)
+ args.gid = (__u64) inode->i_gid;
+ else
+ args.gid = (__u64) current->fsgid;
} else {
- CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode,
- (__u64)-1,
- (__u64)-1,
- 0 /* dev */,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ args.uid = NO_CHANGE_64;
+ args.gid = NO_CHANGE_64;
}
+ CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
+ cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
} else {
/* BB implement mode setting via Windows security
descriptors e.g. */
@@ -267,7 +272,12 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
(cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_SET_UID)) {
newinode->i_uid = current->fsuid;
- newinode->i_gid = current->fsgid;
+ if (inode->i_mode & S_ISGID)
+ newinode->i_gid =
+ inode->i_gid;
+ else
+ newinode->i_gid =
+ current->fsgid;
}
}
}
@@ -357,21 +367,24 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
if (full_path == NULL)
rc = -ENOMEM;
else if (pTcon->unix_ext) {
- mode &= ~current->fs->umask;
+ struct cifs_unix_set_info_args args = {
+ .mode = mode & ~current->fs->umask,
+ .ctime = NO_CHANGE_64,
+ .atime = NO_CHANGE_64,
+ .mtime = NO_CHANGE_64,
+ .device = device_number,
+ };
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- rc = CIFSSMBUnixSetPerms(xid, pTcon, full_path,
- mode, (__u64)current->fsuid,
- (__u64)current->fsgid,
- device_number, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ args.uid = (__u64) current->fsuid;
+ args.gid = (__u64) current->fsgid;
} else {
- rc = CIFSSMBUnixSetPerms(xid, pTcon,
- full_path, mode, (__u64)-1, (__u64)-1,
- device_number, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ args.uid = NO_CHANGE_64;
+ args.gid = NO_CHANGE_64;
}
+ rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path,
+ &args, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
if (!rc) {
rc = cifs_get_inode_info_unix(&newinode, full_path,
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index f730ef35499..1e0c1bd8f2e 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -29,38 +29,13 @@
#include "cifsproto.h"
#include "cifs_debug.h"
-static int dns_resolver_instantiate(struct key *key, const void *data,
- size_t datalen)
-{
- int rc = 0;
- char *ip;
-
- ip = kmalloc(datalen+1, GFP_KERNEL);
- if (!ip)
- return -ENOMEM;
-
- memcpy(ip, data, datalen);
- ip[datalen] = '\0';
-
- rcu_assign_pointer(key->payload.data, ip);
-
- return rc;
-}
-
-struct key_type key_type_dns_resolver = {
- .name = "dns_resolver",
- .def_datalen = sizeof(struct in_addr),
- .describe = user_describe,
- .instantiate = dns_resolver_instantiate,
- .match = user_match,
-};
-
/* Checks if supplied name is IP address
* returns:
* 1 - name is IP
* 0 - name is not IP
*/
-static int is_ip(const char *name)
+static int
+is_ip(const char *name)
{
int rc;
struct sockaddr_in sin_server;
@@ -82,6 +57,47 @@ static int is_ip(const char *name)
return 0;
}
+static int
+dns_resolver_instantiate(struct key *key, const void *data,
+ size_t datalen)
+{
+ int rc = 0;
+ char *ip;
+
+ ip = kmalloc(datalen + 1, GFP_KERNEL);
+ if (!ip)
+ return -ENOMEM;
+
+ memcpy(ip, data, datalen);
+ ip[datalen] = '\0';
+
+ /* make sure this looks like an address */
+ if (!is_ip((const char *) ip)) {
+ kfree(ip);
+ return -EINVAL;
+ }
+
+ key->type_data.x[0] = datalen;
+ rcu_assign_pointer(key->payload.data, ip);
+
+ return rc;
+}
+
+static void
+dns_resolver_destroy(struct key *key)
+{
+ kfree(key->payload.data);
+}
+
+struct key_type key_type_dns_resolver = {
+ .name = "dns_resolver",
+ .def_datalen = sizeof(struct in_addr),
+ .describe = user_describe,
+ .instantiate = dns_resolver_instantiate,
+ .destroy = dns_resolver_destroy,
+ .match = user_match,
+};
+
/* Resolves server name to ip address.
* input:
* unc - server UNC
@@ -133,6 +149,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
rkey = request_key(&key_type_dns_resolver, name, "");
if (!IS_ERR(rkey)) {
+ len = rkey->type_data.x[0];
data = rkey->payload.data;
} else {
cERROR(1, ("%s: unable to resolve: %s", __func__, name));
@@ -141,11 +158,9 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
skip_upcall:
if (data) {
- len = strlen(data);
- *ip_addr = kmalloc(len+1, GFP_KERNEL);
+ *ip_addr = kmalloc(len + 1, GFP_KERNEL);
if (*ip_addr) {
- memcpy(*ip_addr, data, len);
- (*ip_addr)[len] = '\0';
+ memcpy(*ip_addr, data, len + 1);
if (!IS_ERR(rkey))
cFYI(1, ("%s: resolved: %s to %s", __func__,
name,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0aac824371a..c4a8a060512 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -107,7 +107,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
/* want handles we can use to read with first
in the list so we do not have to walk the
- list to search for one in prepare_write */
+ list to search for one in write_begin */
if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
list_add_tail(&pCifsFile->flist,
&pCifsInode->openFileList);
@@ -310,18 +310,19 @@ int cifs_open(struct inode *inode, struct file *file)
/* time to set mode which we can not set earlier due to
problems creating new read-only files */
if (pTcon->unix_ext) {
- CIFSSMBUnixSetPerms(xid, pTcon, full_path,
- inode->i_mode,
- (__u64)-1, (__u64)-1, 0 /* dev */,
+ struct cifs_unix_set_info_args args = {
+ .mode = inode->i_mode,
+ .uid = NO_CHANGE_64,
+ .gid = NO_CHANGE_64,
+ .ctime = NO_CHANGE_64,
+ .atime = NO_CHANGE_64,
+ .mtime = NO_CHANGE_64,
+ .device = 0,
+ };
+ CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- } else {
- /* BB implement via Windows security descriptors eg
- CIFSSMBWinSetPerms(xid, pTcon, full_path, mode,
- -1, -1, local_nls);
- in the meantime could set r/o dos attribute when
- perms are eg: mode & 0222 == 0 */
}
}
@@ -832,6 +833,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
return -EBADF;
open_file = (struct cifsFileInfo *) file->private_data;
+ rc = generic_write_checks(file, poffset, &write_size, 0);
+ if (rc)
+ return rc;
+
xid = GetXid();
if (*poffset > file->f_path.dentry->d_inode->i_size)
@@ -910,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data,
}
static ssize_t cifs_write(struct file *file, const char *write_data,
- size_t write_size, loff_t *poffset)
+ size_t write_size, loff_t *poffset)
{
int rc = 0;
unsigned int bytes_written = 0;
@@ -1060,6 +1065,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode)
struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
{
struct cifsFileInfo *open_file;
+ bool any_available = false;
int rc;
/* Having a null inode here (because mapping->host was set to zero by
@@ -1075,8 +1081,10 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode)
read_lock(&GlobalSMBSeslock);
refind_writable:
list_for_each_entry(open_file, &cifs_inode->openFileList, flist) {
- if (open_file->closePend)
+ if (open_file->closePend ||
+ (!any_available && open_file->pid != current->tgid))
continue;
+
if (open_file->pfile &&
((open_file->pfile->f_flags & O_RDWR) ||
(open_file->pfile->f_flags & O_WRONLY))) {
@@ -1126,6 +1134,11 @@ refind_writable:
of the loop here. */
}
}
+ /* couldn't find useable FH with same pid, try any available */
+ if (!any_available) {
+ any_available = true;
+ goto refind_writable;
+ }
read_unlock(&GlobalSMBSeslock);
return NULL;
}
@@ -1280,7 +1293,7 @@ retry:
if (first < 0)
lock_page(page);
- else if (TestSetPageLocked(page))
+ else if (!trylock_page(page))
break;
if (unlikely(page->mapping != mapping)) {
@@ -1442,49 +1455,52 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc)
return rc;
}
-static int cifs_commit_write(struct file *file, struct page *page,
- unsigned offset, unsigned to)
+static int cifs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
- int xid;
- int rc = 0;
- struct inode *inode = page->mapping->host;
- loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
- char *page_data;
+ int rc;
+ struct inode *inode = mapping->host;
- xid = GetXid();
- cFYI(1, ("commit write for page %p up to position %lld for %d",
- page, position, to));
- spin_lock(&inode->i_lock);
- if (position > inode->i_size)
- i_size_write(inode, position);
+ cFYI(1, ("write_end for page %p from pos %lld with %d bytes",
+ page, pos, copied));
+
+ if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE)
+ SetPageUptodate(page);
- spin_unlock(&inode->i_lock);
if (!PageUptodate(page)) {
- position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + offset;
- /* can not rely on (or let) writepage write this data */
- if (to < offset) {
- cFYI(1, ("Illegal offsets, can not copy from %d to %d",
- offset, to));
- FreeXid(xid);
- return rc;
- }
+ char *page_data;
+ unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ int xid;
+
+ xid = GetXid();
/* this is probably better than directly calling
partialpage_write since in this function the file handle is
known which we might as well leverage */
/* BB check if anything else missing out of ppw
such as updating last write time */
page_data = kmap(page);
- rc = cifs_write(file, page_data + offset, to-offset,
- &position);
- if (rc > 0)
- rc = 0;
- /* else if (rc < 0) should we set writebehind rc? */
+ rc = cifs_write(file, page_data + offset, copied, &pos);
+ /* if (rc < 0) should we set writebehind rc? */
kunmap(page);
+
+ FreeXid(xid);
} else {
+ rc = copied;
+ pos += copied;
set_page_dirty(page);
}
- FreeXid(xid);
+ if (rc > 0) {
+ spin_lock(&inode->i_lock);
+ if (pos > inode->i_size)
+ i_size_write(inode, pos);
+ spin_unlock(&inode->i_lock);
+ }
+
+ unlock_page(page);
+ page_cache_release(page);
+
return rc;
}
@@ -2030,49 +2046,44 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
return true;
}
-static int cifs_prepare_write(struct file *file, struct page *page,
- unsigned from, unsigned to)
+static int cifs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
{
- int rc = 0;
- loff_t i_size;
- loff_t offset;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
+
+ cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
- cFYI(1, ("prepare write for page %p from %d to %d", page, from, to));
- if (PageUptodate(page))
+ *pagep = __grab_cache_page(mapping, index);
+ if (!*pagep)
+ return -ENOMEM;
+
+ if (PageUptodate(*pagep))
return 0;
/* If we are writing a full page it will be up to date,
no need to read from the server */
- if ((to == PAGE_CACHE_SIZE) && (from == 0)) {
- SetPageUptodate(page);
+ if (len == PAGE_CACHE_SIZE && flags & AOP_FLAG_UNINTERRUPTIBLE)
return 0;
- }
- offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
- i_size = i_size_read(page->mapping->host);
+ if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+ int rc;
- if ((offset >= i_size) ||
- ((from == 0) && (offset + to) >= i_size)) {
- /*
- * We don't need to read data beyond the end of the file.
- * zero it, and set the page uptodate
- */
- simple_prepare_write(file, page, from, to);
- SetPageUptodate(page);
- } else if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
/* might as well read a page, it is fast enough */
- rc = cifs_readpage_worker(file, page, &offset);
+ rc = cifs_readpage_worker(file, *pagep, &offset);
+
+ /* we do not need to pass errors back
+ e.g. if we do not have read access to the file
+ because cifs_write_end will attempt synchronous writes
+ -- shaggy */
} else {
/* we could try using another file handle if there is one -
but how would we lock it to prevent close of that handle
racing with this read? In any case
- this will be written out by commit_write so is fine */
+ this will be written out by write_end so is fine */
}
- /* we do not need to pass errors back
- e.g. if we do not have read access to the file
- because cifs_commit_write will do the right thing. -- shaggy */
-
return 0;
}
@@ -2081,8 +2092,8 @@ const struct address_space_operations cifs_addr_ops = {
.readpages = cifs_readpages,
.writepage = cifs_writepage,
.writepages = cifs_writepages,
- .prepare_write = cifs_prepare_write,
- .commit_write = cifs_commit_write,
+ .write_begin = cifs_write_begin,
+ .write_end = cifs_write_end,
.set_page_dirty = __set_page_dirty_nobuffers,
/* .sync_page = cifs_sync_page, */
/* .direct_IO = */
@@ -2097,8 +2108,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.readpage = cifs_readpage,
.writepage = cifs_writepage,
.writepages = cifs_writepages,
- .prepare_write = cifs_prepare_write,
- .commit_write = cifs_commit_write,
+ .write_begin = cifs_write_begin,
+ .write_end = cifs_write_end,
.set_page_dirty = __set_page_dirty_nobuffers,
/* .sync_page = cifs_sync_page, */
/* .direct_IO = */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 722be543cee..a8c833345fc 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -219,15 +219,15 @@ int cifs_get_inode_info_unix(struct inode **pinode,
rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc) {
- if (rc == -EREMOTE && !is_dfs_referral) {
- is_dfs_referral = true;
- cFYI(DBG2, ("DFS ref"));
- /* for DFS, server does not give us real inode data */
- fill_fake_finddataunix(&find_data, sb);
- rc = 0;
- }
- }
+ if (rc == -EREMOTE && !is_dfs_referral) {
+ is_dfs_referral = true;
+ cFYI(DBG2, ("DFS ref"));
+ /* for DFS, server does not give us real inode data */
+ fill_fake_finddataunix(&find_data, sb);
+ rc = 0;
+ } else if (rc)
+ goto cgiiu_exit;
+
num_of_bytes = le64_to_cpu(find_data.NumOfBytes);
end_of_file = le64_to_cpu(find_data.EndOfFile);
@@ -236,7 +236,7 @@ int cifs_get_inode_info_unix(struct inode **pinode,
*pinode = new_inode(sb);
if (*pinode == NULL) {
rc = -ENOMEM;
- goto cgiiu_exit;
+ goto cgiiu_exit;
}
/* Is an i_ino of zero legal? */
/* note ino incremented to unique num in new_inode */
@@ -546,7 +546,8 @@ int cifs_get_inode_info(struct inode **pinode,
if ((inode->i_mode & S_IWUGO) == 0 &&
(attr & ATTR_READONLY) == 0)
inode->i_mode |= (S_IWUGO & default_mode);
- inode->i_mode &= ~S_IFMT;
+
+ inode->i_mode &= ~S_IFMT;
}
/* clear write bits if ATTR_READONLY is set */
if (attr & ATTR_READONLY)
@@ -649,6 +650,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
inode->i_fop = &simple_dir_operations;
inode->i_uid = cifs_sb->mnt_uid;
inode->i_gid = cifs_sb->mnt_gid;
+ } else if (rc) {
_FreeXid(xid);
iget_failed(inode);
return ERR_PTR(rc);
@@ -663,40 +665,201 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
return inode;
}
-int cifs_unlink(struct inode *inode, struct dentry *direntry)
+static int
+cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
+ char *full_path, __u32 dosattr)
+{
+ int rc;
+ int oplock = 0;
+ __u16 netfid;
+ __u32 netpid;
+ bool set_time = false;
+ struct cifsFileInfo *open_file;
+ struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifsTconInfo *pTcon = cifs_sb->tcon;
+ FILE_BASIC_INFO info_buf;
+
+ if (attrs->ia_valid & ATTR_ATIME) {
+ set_time = true;
+ info_buf.LastAccessTime =
+ cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
+ } else
+ info_buf.LastAccessTime = 0;
+
+ if (attrs->ia_valid & ATTR_MTIME) {
+ set_time = true;
+ info_buf.LastWriteTime =
+ cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
+ } else
+ info_buf.LastWriteTime = 0;
+
+ /*
+ * Samba throws this field away, but windows may actually use it.
+ * Do not set ctime unless other time stamps are changed explicitly
+ * (i.e. by utimes()) since we would then have a mix of client and
+ * server times.
+ */
+ if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
+ cFYI(1, ("CIFS - CTIME changed"));
+ info_buf.ChangeTime =
+ cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
+ } else
+ info_buf.ChangeTime = 0;
+
+ info_buf.CreationTime = 0; /* don't change */
+ info_buf.Attributes = cpu_to_le32(dosattr);
+
+ /*
+ * If the file is already open for write, just use that fileid
+ */
+ open_file = find_writable_file(cifsInode);
+ if (open_file) {
+ netfid = open_file->netfid;
+ netpid = open_file->pid;
+ goto set_via_filehandle;
+ }
+
+ /*
+ * NT4 apparently returns success on this call, but it doesn't
+ * really work.
+ */
+ if (!(pTcon->ses->flags & CIFS_SES_NT4)) {
+ rc = CIFSSMBSetPathInfo(xid, pTcon, full_path,
+ &info_buf, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (rc == 0) {
+ cifsInode->cifsAttrs = dosattr;
+ goto out;
+ } else if (rc != -EOPNOTSUPP && rc != -EINVAL)
+ goto out;
+ }
+
+ cFYI(1, ("calling SetFileInfo since SetPathInfo for "
+ "times not supported by this server"));
+ rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
+ SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
+ CREATE_NOT_DIR, &netfid, &oplock,
+ NULL, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+ if (rc != 0) {
+ if (rc == -EIO)
+ rc = -EINVAL;
+ goto out;
+ }
+
+ netpid = current->tgid;
+
+set_via_filehandle:
+ rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid);
+ if (!rc)
+ cifsInode->cifsAttrs = dosattr;
+
+ if (open_file == NULL)
+ CIFSSMBClose(xid, pTcon, netfid);
+ else
+ atomic_dec(&open_file->wrtPending);
+out:
+ return rc;
+}
+
+/*
+ * open the given file (if it isn't already), set the DELETE_ON_CLOSE bit
+ * and rename it to a random name that hopefully won't conflict with
+ * anything else.
+ */
+static int
+cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid)
+{
+ int oplock = 0;
+ int rc;
+ __u16 netfid;
+ struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifsTconInfo *tcon = cifs_sb->tcon;
+ __u32 dosattr;
+ FILE_BASIC_INFO *info_buf;
+
+ rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
+ DELETE|FILE_WRITE_ATTRIBUTES,
+ CREATE_NOT_DIR|CREATE_DELETE_ON_CLOSE,
+ &netfid, &oplock, NULL, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (rc != 0)
+ goto out;
+
+ /* set ATTR_HIDDEN and clear ATTR_READONLY */
+ cifsInode = CIFS_I(inode);
+ dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY;
+ if (dosattr == 0)
+ dosattr |= ATTR_NORMAL;
+ dosattr |= ATTR_HIDDEN;
+
+ info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL);
+ if (info_buf == NULL) {
+ rc = -ENOMEM;
+ goto out_close;
+ }
+ info_buf->Attributes = cpu_to_le32(dosattr);
+ rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, current->tgid);
+ kfree(info_buf);
+ if (rc != 0)
+ goto out_close;
+ cifsInode->cifsAttrs = dosattr;
+
+ /* silly-rename the file */
+ CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+ /* set DELETE_ON_CLOSE */
+ rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, current->tgid);
+
+ /*
+ * some samba versions return -ENOENT when we try to set the file
+ * disposition here. Likely a samba bug, but work around it for now
+ */
+ if (rc == -ENOENT)
+ rc = 0;
+
+out_close:
+ CIFSSMBClose(xid, tcon, netfid);
+out:
+ return rc;
+}
+
+int cifs_unlink(struct inode *dir, struct dentry *dentry)
{
int rc = 0;
int xid;
- struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *pTcon;
char *full_path = NULL;
- struct cifsInodeInfo *cifsInode;
- FILE_BASIC_INFO *pinfo_buf;
+ struct inode *inode = dentry->d_inode;
+ struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+ struct super_block *sb = dir->i_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct cifsTconInfo *tcon = cifs_sb->tcon;
+ struct iattr *attrs = NULL;
+ __u32 dosattr = 0, origattr = 0;
- cFYI(1, ("cifs_unlink, inode = 0x%p", inode));
+ cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry));
xid = GetXid();
- if (inode)
- cifs_sb = CIFS_SB(inode->i_sb);
- else
- cifs_sb = CIFS_SB(direntry->d_sb);
- pTcon = cifs_sb->tcon;
-
- /* Unlink can be called from rename so we can not grab the sem here
- since we deadlock otherwise */
-/* mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);*/
- full_path = build_path_from_dentry(direntry);
-/* mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);*/
+ /* Unlink can be called from rename so we can not take the
+ * sb->s_vfs_rename_mutex here */
+ full_path = build_path_from_dentry(dentry);
if (full_path == NULL) {
FreeXid(xid);
return -ENOMEM;
}
- if ((pTcon->ses->capabilities & CAP_UNIX) &&
+ if ((tcon->ses->capabilities & CAP_UNIX) &&
(CIFS_UNIX_POSIX_PATH_OPS_CAP &
- le64_to_cpu(pTcon->fsUnixInfo.Capability))) {
- rc = CIFSPOSIXDelFile(xid, pTcon, full_path,
+ le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+ rc = CIFSPOSIXDelFile(xid, tcon, full_path,
SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
cFYI(1, ("posix del rc %d", rc));
@@ -704,124 +867,60 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry)
goto psx_del_no_retry;
}
- rc = CIFSSMBDelFile(xid, pTcon, full_path, cifs_sb->local_nls,
+retry_std_delete:
+ rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+
psx_del_no_retry:
if (!rc) {
- if (direntry->d_inode)
- drop_nlink(direntry->d_inode);
+ if (inode)
+ drop_nlink(inode);
} else if (rc == -ENOENT) {
- d_drop(direntry);
+ d_drop(dentry);
} else if (rc == -ETXTBSY) {
- int oplock = 0;
- __u16 netfid;
-
- rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, DELETE,
- CREATE_NOT_DIR | CREATE_DELETE_ON_CLOSE,
- &netfid, &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc == 0) {
- CIFSSMBRenameOpenFile(xid, pTcon, netfid, NULL,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- CIFSSMBClose(xid, pTcon, netfid);
- if (direntry->d_inode)
- drop_nlink(direntry->d_inode);
+ rc = cifs_rename_pending_delete(full_path, inode, xid);
+ if (rc == 0)
+ drop_nlink(inode);
+ } else if (rc == -EACCES && dosattr == 0) {
+ attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
+ if (attrs == NULL) {
+ rc = -ENOMEM;
+ goto out_reval;
}
- } else if (rc == -EACCES) {
- /* try only if r/o attribute set in local lookup data? */
- pinfo_buf = kzalloc(sizeof(FILE_BASIC_INFO), GFP_KERNEL);
- if (pinfo_buf) {
- /* ATTRS set to normal clears r/o bit */
- pinfo_buf->Attributes = cpu_to_le32(ATTR_NORMAL);
- if (!(pTcon->ses->flags & CIFS_SES_NT4))
- rc = CIFSSMBSetTimes(xid, pTcon, full_path,
- pinfo_buf,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- else
- rc = -EOPNOTSUPP;
- if (rc == -EOPNOTSUPP) {
- int oplock = 0;
- __u16 netfid;
- /* rc = CIFSSMBSetAttrLegacy(xid, pTcon,
- full_path,
- (__u16)ATTR_NORMAL,
- cifs_sb->local_nls);
- For some strange reason it seems that NT4 eats the
- old setattr call without actually setting the
- attributes so on to the third attempted workaround
- */
-
- /* BB could scan to see if we already have it open
- and pass in pid of opener to function */
- rc = CIFSSMBOpen(xid, pTcon, full_path,
- FILE_OPEN, SYNCHRONIZE |
- FILE_WRITE_ATTRIBUTES, 0,
- &netfid, &oplock, NULL,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc == 0) {
- rc = CIFSSMBSetFileTimes(xid, pTcon,
- pinfo_buf,
- netfid);
- CIFSSMBClose(xid, pTcon, netfid);
- }
- }
- kfree(pinfo_buf);
- }
- if (rc == 0) {
- rc = CIFSSMBDelFile(xid, pTcon, full_path,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (!rc) {
- if (direntry->d_inode)
- drop_nlink(direntry->d_inode);
- } else if (rc == -ETXTBSY) {
- int oplock = 0;
- __u16 netfid;
-
- rc = CIFSSMBOpen(xid, pTcon, full_path,
- FILE_OPEN, DELETE,
- CREATE_NOT_DIR |
- CREATE_DELETE_ON_CLOSE,
- &netfid, &oplock, NULL,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc == 0) {
- CIFSSMBRenameOpenFile(xid, pTcon,
- netfid, NULL,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- CIFSSMBClose(xid, pTcon, netfid);
- if (direntry->d_inode)
- drop_nlink(direntry->d_inode);
- }
- /* BB if rc = -ETXTBUSY goto the rename logic BB */
- }
- }
- }
- if (direntry->d_inode) {
- cifsInode = CIFS_I(direntry->d_inode);
- cifsInode->time = 0; /* will force revalidate to get info
- when needed */
- direntry->d_inode->i_ctime = current_fs_time(inode->i_sb);
+ /* try to reset dos attributes */
+ origattr = cifsInode->cifsAttrs;
+ if (origattr == 0)
+ origattr |= ATTR_NORMAL;
+ dosattr = origattr & ~ATTR_READONLY;
+ if (dosattr == 0)
+ dosattr |= ATTR_NORMAL;
+ dosattr |= ATTR_HIDDEN;
+
+ rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr);
+ if (rc != 0)
+ goto out_reval;
+
+ goto retry_std_delete;
}
+
+ /* undo the setattr if we errored out and it's needed */
+ if (rc != 0 && dosattr != 0)
+ cifs_set_file_info(inode, attrs, xid, full_path, origattr);
+
+out_reval:
if (inode) {
- inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
cifsInode = CIFS_I(inode);
- cifsInode->time = 0; /* force revalidate of dir as well */
+ cifsInode->time = 0; /* will force revalidate to get info
+ when needed */
+ inode->i_ctime = current_fs_time(sb);
}
+ dir->i_ctime = dir->i_mtime = current_fs_time(sb);
+ cifsInode = CIFS_I(dir);
+ CIFS_I(dir)->time = 0; /* force revalidate of dir as well */
kfree(full_path);
+ kfree(attrs);
FreeXid(xid);
return rc;
}
@@ -866,7 +965,7 @@ static void posix_fill_in_inode(struct inode *tmp_inode,
int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
{
- int rc = 0;
+ int rc = 0, tmprc;
int xid;
struct cifs_sb_info *cifs_sb;
struct cifsTconInfo *pTcon;
@@ -928,6 +1027,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
kfree(pInfo);
goto mkdir_get_info;
}
+
/* Is an i_ino of zero legal? */
/* Are there sanity checks we can use to ensure that
the server is really filling in that field? */
@@ -984,35 +1084,52 @@ mkdir_get_info:
* failed to get it from the server or was set bogus */
if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2))
direntry->d_inode->i_nlink = 2;
+
mode &= ~current->fs->umask;
+ /* must turn on setgid bit if parent dir has it */
+ if (inode->i_mode & S_ISGID)
+ mode |= S_ISGID;
+
if (pTcon->unix_ext) {
+ struct cifs_unix_set_info_args args = {
+ .mode = mode,
+ .ctime = NO_CHANGE_64,
+ .atime = NO_CHANGE_64,
+ .mtime = NO_CHANGE_64,
+ .device = 0,
+ };
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
- CIFSSMBUnixSetPerms(xid, pTcon, full_path,
- mode,
- (__u64)current->fsuid,
- (__u64)current->fsgid,
- 0 /* dev_t */,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ args.uid = (__u64)current->fsuid;
+ if (inode->i_mode & S_ISGID)
+ args.gid = (__u64)inode->i_gid;
+ else
+ args.gid = (__u64)current->fsgid;
} else {
- CIFSSMBUnixSetPerms(xid, pTcon, full_path,
- mode, (__u64)-1,
- (__u64)-1, 0 /* dev_t */,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ args.uid = NO_CHANGE_64;
+ args.gid = NO_CHANGE_64;
}
+ CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
+ cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
} else {
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
(mode & S_IWUGO) == 0) {
FILE_BASIC_INFO pInfo;
+ struct cifsInodeInfo *cifsInode;
+ u32 dosattrs;
+
memset(&pInfo, 0, sizeof(pInfo));
- pInfo.Attributes = cpu_to_le32(ATTR_READONLY);
- CIFSSMBSetTimes(xid, pTcon, full_path,
- &pInfo, cifs_sb->local_nls,
+ cifsInode = CIFS_I(newinode);
+ dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
+ pInfo.Attributes = cpu_to_le32(dosattrs);
+ tmprc = CIFSSMBSetPathInfo(xid, pTcon,
+ full_path, &pInfo,
+ cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (tmprc == 0)
+ cifsInode->cifsAttrs = dosattrs;
}
if (direntry->d_inode) {
if (cifs_sb->mnt_cifs_flags &
@@ -1024,8 +1141,12 @@ mkdir_get_info:
CIFS_MOUNT_SET_UID) {
direntry->d_inode->i_uid =
current->fsuid;
- direntry->d_inode->i_gid =
- current->fsgid;
+ if (inode->i_mode & S_ISGID)
+ direntry->d_inode->i_gid =
+ inode->i_gid;
+ else
+ direntry->d_inode->i_gid =
+ current->fsgid;
}
}
}
@@ -1080,117 +1201,141 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry)
return rc;
}
+static int
+cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath,
+ struct dentry *to_dentry, const char *toPath)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb);
+ struct cifsTconInfo *pTcon = cifs_sb->tcon;
+ __u16 srcfid;
+ int oplock, rc;
+
+ /* try path-based rename first */
+ rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+ /*
+ * don't bother with rename by filehandle unless file is busy and
+ * source Note that cross directory moves do not work with
+ * rename by filehandle to various Windows servers.
+ */
+ if (rc == 0 || rc != -ETXTBSY)
+ return rc;
+
+ /* open the file to be renamed -- we need DELETE perms */
+ rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE,
+ CREATE_NOT_DIR, &srcfid, &oplock, NULL,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+ if (rc == 0) {
+ rc = CIFSSMBRenameOpenFile(xid, pTcon, srcfid,
+ (const char *) to_dentry->d_name.name,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+ CIFSSMBClose(xid, pTcon, srcfid);
+ }
+
+ return rc;
+}
+
int cifs_rename(struct inode *source_inode, struct dentry *source_direntry,
struct inode *target_inode, struct dentry *target_direntry)
{
- char *fromName;
- char *toName;
+ char *fromName = NULL;
+ char *toName = NULL;
struct cifs_sb_info *cifs_sb_source;
struct cifs_sb_info *cifs_sb_target;
struct cifsTconInfo *pTcon;
+ FILE_UNIX_BASIC_INFO *info_buf_source = NULL;
+ FILE_UNIX_BASIC_INFO *info_buf_target;
int xid;
- int rc = 0;
-
- xid = GetXid();
+ int rc;
cifs_sb_target = CIFS_SB(target_inode->i_sb);
cifs_sb_source = CIFS_SB(source_inode->i_sb);
pTcon = cifs_sb_source->tcon;
+ xid = GetXid();
+
+ /*
+ * BB: this might be allowed if same server, but different share.
+ * Consider adding support for this
+ */
if (pTcon != cifs_sb_target->tcon) {
- FreeXid(xid);
- return -EXDEV; /* BB actually could be allowed if same server,
- but different share.
- Might eventually add support for this */
+ rc = -EXDEV;
+ goto cifs_rename_exit;
}
- /* we already have the rename sem so we do not need to grab it again
- here to protect the path integrity */
+ /*
+ * we already have the rename sem so we do not need to
+ * grab it again here to protect the path integrity
+ */
fromName = build_path_from_dentry(source_direntry);
+ if (fromName == NULL) {
+ rc = -ENOMEM;
+ goto cifs_rename_exit;
+ }
+
toName = build_path_from_dentry(target_direntry);
- if ((fromName == NULL) || (toName == NULL)) {
+ if (toName == NULL) {
rc = -ENOMEM;
goto cifs_rename_exit;
}
- rc = CIFSSMBRename(xid, pTcon, fromName, toName,
- cifs_sb_source->local_nls,
- cifs_sb_source->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ rc = cifs_do_rename(xid, source_direntry, fromName,
+ target_direntry, toName);
+
if (rc == -EEXIST) {
- /* check if they are the same file because rename of hardlinked
- files is a noop */
- FILE_UNIX_BASIC_INFO *info_buf_source;
- FILE_UNIX_BASIC_INFO *info_buf_target;
-
- info_buf_source =
- kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
- if (info_buf_source != NULL) {
+ if (pTcon->unix_ext) {
+ /*
+ * Are src and dst hardlinks of same inode? We can
+ * only tell with unix extensions enabled
+ */
+ info_buf_source =
+ kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO),
+ GFP_KERNEL);
+ if (info_buf_source == NULL)
+ goto unlink_target;
+
info_buf_target = info_buf_source + 1;
- if (pTcon->unix_ext)
- rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName,
- info_buf_source,
- cifs_sb_source->local_nls,
- cifs_sb_source->mnt_cifs_flags &
+ rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName,
+ info_buf_source,
+ cifs_sb_source->local_nls,
+ cifs_sb_source->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- /* else rc is still EEXIST so will fall through to
- unlink the target and retry rename */
- if (rc == 0) {
- rc = CIFSSMBUnixQPathInfo(xid, pTcon, toName,
- info_buf_target,
+ if (rc != 0)
+ goto unlink_target;
+
+ rc = CIFSSMBUnixQPathInfo(xid, pTcon,
+ toName, info_buf_target,
cifs_sb_target->local_nls,
/* remap based on source sb */
cifs_sb_source->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- }
- if ((rc == 0) &&
- (info_buf_source->UniqueId ==
- info_buf_target->UniqueId)) {
- /* do not rename since the files are hardlinked which
- is a noop */
- } else {
- /* we either can not tell the files are hardlinked
- (as with Windows servers) or files are not
- hardlinked so delete the target manually before
- renaming to follow POSIX rather than Windows
- semantics */
- cifs_unlink(target_inode, target_direntry);
- rc = CIFSSMBRename(xid, pTcon, fromName,
- toName,
- cifs_sb_source->local_nls,
- cifs_sb_source->mnt_cifs_flags
- & CIFS_MOUNT_MAP_SPECIAL_CHR);
- }
- kfree(info_buf_source);
- } /* if we can not get memory just leave rc as EEXIST */
- }
-
- if (rc)
- cFYI(1, ("rename rc %d", rc));
-
- if ((rc == -EIO) || (rc == -EEXIST)) {
- int oplock = 0;
- __u16 netfid;
-
- /* BB FIXME Is Generic Read correct for rename? */
- /* if renaming directory - we should not say CREATE_NOT_DIR,
- need to test renaming open directory, also GENERIC_READ
- might not right be right access to request */
- rc = CIFSSMBOpen(xid, pTcon, fromName, FILE_OPEN, GENERIC_READ,
- CREATE_NOT_DIR, &netfid, &oplock, NULL,
- cifs_sb_source->local_nls,
- cifs_sb_source->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc == 0) {
- rc = CIFSSMBRenameOpenFile(xid, pTcon, netfid, toName,
- cifs_sb_source->local_nls,
- cifs_sb_source->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- CIFSSMBClose(xid, pTcon, netfid);
- }
+
+ if (rc == 0 && (info_buf_source->UniqueId ==
+ info_buf_target->UniqueId))
+ /* same file, POSIX says that this is a noop */
+ goto cifs_rename_exit;
+ } /* else ... BB we could add the same check for Windows by
+ checking the UniqueId via FILE_INTERNAL_INFO */
+unlink_target:
+ /*
+ * we either can not tell the files are hardlinked (as with
+ * Windows servers) or files are not hardlinked. Delete the
+ * target manually before renaming to follow POSIX rather than
+ * Windows semantics
+ */
+ cifs_unlink(target_inode, target_direntry);
+ rc = cifs_do_rename(xid, source_direntry, fromName,
+ target_direntry, toName);
}
cifs_rename_exit:
+ kfree(info_buf_source);
kfree(fromName);
kfree(toName);
FreeXid(xid);
@@ -1310,10 +1455,11 @@ int cifs_revalidate(struct dentry *direntry)
/* if (S_ISDIR(direntry->d_inode->i_mode))
shrink_dcache_parent(direntry); */
if (S_ISREG(direntry->d_inode->i_mode)) {
- if (direntry->d_inode->i_mapping)
+ if (direntry->d_inode->i_mapping) {
wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
if (wbrc)
CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
+ }
/* may eventually have to do this for open files too */
if (list_empty(&(cifsInode->openFileList))) {
/* changed on server - flush read ahead pages */
@@ -1413,31 +1559,209 @@ out_busy:
return -ETXTBSY;
}
-int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
+static int
+cifs_set_file_size(struct inode *inode, struct iattr *attrs,
+ int xid, char *full_path)
{
+ int rc;
+ struct cifsFileInfo *open_file;
+ struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifsTconInfo *pTcon = cifs_sb->tcon;
+
+ /*
+ * To avoid spurious oplock breaks from server, in the case of
+ * inodes that we already have open, avoid doing path based
+ * setting of file size if we can do it by handle.
+ * This keeps our caching token (oplock) and avoids timeouts
+ * when the local oplock break takes longer to flush
+ * writebehind data than the SMB timeout for the SetPathInfo
+ * request would allow
+ */
+ open_file = find_writable_file(cifsInode);
+ if (open_file) {
+ __u16 nfid = open_file->netfid;
+ __u32 npid = open_file->pid;
+ rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid,
+ npid, false);
+ atomic_dec(&open_file->wrtPending);
+ cFYI(1, ("SetFSize for attrs rc = %d", rc));
+ if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
+ unsigned int bytes_written;
+ rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size,
+ &bytes_written, NULL, NULL, 1);
+ cFYI(1, ("Wrt seteof rc %d", rc));
+ }
+ } else
+ rc = -EINVAL;
+
+ if (rc != 0) {
+ /* Set file size by pathname rather than by handle
+ either because no valid, writeable file handle for
+ it was found or because there was an error setting
+ it by handle */
+ rc = CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size,
+ false, cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc));
+ if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
+ __u16 netfid;
+ int oplock = 0;
+
+ rc = SMBLegacyOpen(xid, pTcon, full_path,
+ FILE_OPEN, GENERIC_WRITE,
+ CREATE_NOT_DIR, &netfid, &oplock, NULL,
+ cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (rc == 0) {
+ unsigned int bytes_written;
+ rc = CIFSSMBWrite(xid, pTcon, netfid, 0,
+ attrs->ia_size,
+ &bytes_written, NULL,
+ NULL, 1);
+ cFYI(1, ("wrt seteof rc %d", rc));
+ CIFSSMBClose(xid, pTcon, netfid);
+ }
+ }
+ }
+
+ if (rc == 0) {
+ rc = cifs_vmtruncate(inode, attrs->ia_size);
+ cifs_truncate_page(inode->i_mapping, inode->i_size);
+ }
+
+ return rc;
+}
+
+static int
+cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
+{
+ int rc;
int xid;
- struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *pTcon;
char *full_path = NULL;
- int rc = -EACCES;
- struct cifsFileInfo *open_file = NULL;
- FILE_BASIC_INFO time_buf;
- bool set_time = false;
- bool set_dosattr = false;
- __u64 mode = 0xFFFFFFFFFFFFFFFFULL;
- __u64 uid = 0xFFFFFFFFFFFFFFFFULL;
- __u64 gid = 0xFFFFFFFFFFFFFFFFULL;
- struct cifsInodeInfo *cifsInode;
struct inode *inode = direntry->d_inode;
+ struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifsTconInfo *pTcon = cifs_sb->tcon;
+ struct cifs_unix_set_info_args *args = NULL;
+
+ cFYI(1, ("setattr_unix on file %s attrs->ia_valid=0x%x",
+ direntry->d_name.name, attrs->ia_valid));
+
+ xid = GetXid();
+
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) {
+ /* check if we have permission to change attrs */
+ rc = inode_change_ok(inode, attrs);
+ if (rc < 0)
+ goto out;
+ else
+ rc = 0;
+ }
+
+ full_path = build_path_from_dentry(direntry);
+ if (full_path == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) {
+ /*
+ Flush data before changing file size or changing the last
+ write time of the file on the server. If the
+ flush returns error, store it to report later and continue.
+ BB: This should be smarter. Why bother flushing pages that
+ will be truncated anyway? Also, should we error out here if
+ the flush returns error?
+ */
+ rc = filemap_write_and_wait(inode->i_mapping);
+ if (rc != 0) {
+ cifsInode->write_behind_rc = rc;
+ rc = 0;
+ }
+ }
+
+ if (attrs->ia_valid & ATTR_SIZE) {
+ rc = cifs_set_file_size(inode, attrs, xid, full_path);
+ if (rc != 0)
+ goto out;
+ }
+
+ /* skip mode change if it's just for clearing setuid/setgid */
+ if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
+ attrs->ia_valid &= ~ATTR_MODE;
+
+ args = kmalloc(sizeof(*args), GFP_KERNEL);
+ if (args == NULL) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /* set up the struct */
+ if (attrs->ia_valid & ATTR_MODE)
+ args->mode = attrs->ia_mode;
+ else
+ args->mode = NO_CHANGE_64;
+
+ if (attrs->ia_valid & ATTR_UID)
+ args->uid = attrs->ia_uid;
+ else
+ args->uid = NO_CHANGE_64;
+
+ if (attrs->ia_valid & ATTR_GID)
+ args->gid = attrs->ia_gid;
+ else
+ args->gid = NO_CHANGE_64;
+
+ if (attrs->ia_valid & ATTR_ATIME)
+ args->atime = cifs_UnixTimeToNT(attrs->ia_atime);
+ else
+ args->atime = NO_CHANGE_64;
+
+ if (attrs->ia_valid & ATTR_MTIME)
+ args->mtime = cifs_UnixTimeToNT(attrs->ia_mtime);
+ else
+ args->mtime = NO_CHANGE_64;
+
+ if (attrs->ia_valid & ATTR_CTIME)
+ args->ctime = cifs_UnixTimeToNT(attrs->ia_ctime);
+ else
+ args->ctime = NO_CHANGE_64;
+
+ args->device = 0;
+ rc = CIFSSMBUnixSetInfo(xid, pTcon, full_path, args,
+ cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+
+ if (!rc)
+ rc = inode_setattr(inode, attrs);
+out:
+ kfree(args);
+ kfree(full_path);
+ FreeXid(xid);
+ return rc;
+}
+
+static int
+cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
+{
+ int xid;
+ struct inode *inode = direntry->d_inode;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifsInodeInfo *cifsInode = CIFS_I(inode);
+ char *full_path = NULL;
+ int rc = -EACCES;
+ __u32 dosattr = 0;
+ __u64 mode = NO_CHANGE_64;
xid = GetXid();
cFYI(1, ("setattr on file %s attrs->iavalid 0x%x",
direntry->d_name.name, attrs->ia_valid));
- cifs_sb = CIFS_SB(inode->i_sb);
- pTcon = cifs_sb->tcon;
-
if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) == 0) {
/* check if we have permission to change attrs */
rc = inode_change_ok(inode, attrs);
@@ -1453,7 +1777,6 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
FreeXid(xid);
return -ENOMEM;
}
- cifsInode = CIFS_I(inode);
if ((attrs->ia_valid & ATTR_MTIME) || (attrs->ia_valid & ATTR_SIZE)) {
/*
@@ -1472,78 +1795,8 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
}
if (attrs->ia_valid & ATTR_SIZE) {
- /* To avoid spurious oplock breaks from server, in the case of
- inodes that we already have open, avoid doing path based
- setting of file size if we can do it by handle.
- This keeps our caching token (oplock) and avoids timeouts
- when the local oplock break takes longer to flush
- writebehind data than the SMB timeout for the SetPathInfo
- request would allow */
-
- open_file = find_writable_file(cifsInode);
- if (open_file) {
- __u16 nfid = open_file->netfid;
- __u32 npid = open_file->pid;
- rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size,
- nfid, npid, false);
- atomic_dec(&open_file->wrtPending);
- cFYI(1, ("SetFSize for attrs rc = %d", rc));
- if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
- unsigned int bytes_written;
- rc = CIFSSMBWrite(xid, pTcon,
- nfid, 0, attrs->ia_size,
- &bytes_written, NULL, NULL,
- 1 /* 45 seconds */);
- cFYI(1, ("Wrt seteof rc %d", rc));
- }
- } else
- rc = -EINVAL;
-
- if (rc != 0) {
- /* Set file size by pathname rather than by handle
- either because no valid, writeable file handle for
- it was found or because there was an error setting
- it by handle */
- rc = CIFSSMBSetEOF(xid, pTcon, full_path,
- attrs->ia_size, false,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc));
- if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
- __u16 netfid;
- int oplock = 0;
-
- rc = SMBLegacyOpen(xid, pTcon, full_path,
- FILE_OPEN, GENERIC_WRITE,
- CREATE_NOT_DIR, &netfid, &oplock,
- NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc == 0) {
- unsigned int bytes_written;
- rc = CIFSSMBWrite(xid, pTcon,
- netfid, 0,
- attrs->ia_size,
- &bytes_written, NULL,
- NULL, 1 /* 45 sec */);
- cFYI(1, ("wrt seteof rc %d", rc));
- CIFSSMBClose(xid, pTcon, netfid);
- }
-
- }
- }
-
- /* Server is ok setting allocation size implicitly - no need
- to call:
- CIFSSMBSetEOF(xid, pTcon, full_path, attrs->ia_size, true,
- cifs_sb->local_nls);
- */
-
- if (rc == 0) {
- rc = cifs_vmtruncate(inode, attrs->ia_size);
- cifs_truncate_page(inode->i_mapping, inode->i_size);
- } else
+ rc = cifs_set_file_size(inode, attrs, xid, full_path);
+ if (rc != 0)
goto cifs_setattr_exit;
}
@@ -1554,21 +1807,8 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
* CIFSACL support + proper Windows to Unix idmapping, we may be
* able to support this in the future.
*/
- if (!pTcon->unix_ext &&
- !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID))
attrs->ia_valid &= ~(ATTR_UID | ATTR_GID);
- } else {
- if (attrs->ia_valid & ATTR_UID) {
- cFYI(1, ("UID changed to %d", attrs->ia_uid));
- uid = attrs->ia_uid;
- }
- if (attrs->ia_valid & ATTR_GID) {
- cFYI(1, ("GID changed to %d", attrs->ia_gid));
- gid = attrs->ia_gid;
- }
- }
-
- time_buf.Attributes = 0;
/* skip mode change if it's just for clearing setuid/setgid */
if (attrs->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
@@ -1579,13 +1819,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
mode = attrs->ia_mode;
}
- if ((pTcon->unix_ext)
- && (attrs->ia_valid & (ATTR_MODE | ATTR_GID | ATTR_UID)))
- rc = CIFSSMBUnixSetPerms(xid, pTcon, full_path, mode, uid, gid,
- 0 /* dev_t */, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- else if (attrs->ia_valid & ATTR_MODE) {
+ if (attrs->ia_valid & ATTR_MODE) {
rc = 0;
#ifdef CONFIG_CIFS_EXPERIMENTAL
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
@@ -1594,24 +1828,19 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
#endif
if (((mode & S_IWUGO) == 0) &&
(cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
- set_dosattr = true;
- time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs |
- ATTR_READONLY);
+
+ dosattr = cifsInode->cifsAttrs | ATTR_READONLY;
+
/* fix up mode if we're not using dynperm */
if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
attrs->ia_mode = inode->i_mode & ~S_IWUGO;
} else if ((mode & S_IWUGO) &&
(cifsInode->cifsAttrs & ATTR_READONLY)) {
- /* If file is readonly on server, we would
- not be able to write to it - so if any write
- bit is enabled for user or group or other we
- need to at least try to remove r/o dos attr */
- set_dosattr = true;
- time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs &
- (~ATTR_READONLY));
- /* Windows ignores set to zero */
- if (time_buf.Attributes == 0)
- time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL);
+
+ dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY;
+ /* Attributes of 0 are ignored */
+ if (dosattr == 0)
+ dosattr |= ATTR_NORMAL;
/* reset local inode permissions to normal */
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) {
@@ -1629,82 +1858,18 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
}
}
- if (attrs->ia_valid & ATTR_ATIME) {
- set_time = true;
- time_buf.LastAccessTime =
- cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime));
- } else
- time_buf.LastAccessTime = 0;
-
- if (attrs->ia_valid & ATTR_MTIME) {
- set_time = true;
- time_buf.LastWriteTime =
- cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime));
- } else
- time_buf.LastWriteTime = 0;
- /* Do not set ctime explicitly unless other time
- stamps are changed explicitly (i.e. by utime()
- since we would then have a mix of client and
- server times */
-
- if (set_time && (attrs->ia_valid & ATTR_CTIME)) {
- set_time = true;
- /* Although Samba throws this field away
- it may be useful to Windows - but we do
- not want to set ctime unless some other
- timestamp is changing */
- cFYI(1, ("CIFS - CTIME changed"));
- time_buf.ChangeTime =
- cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime));
- } else
- time_buf.ChangeTime = 0;
-
- if (set_time || set_dosattr) {
- time_buf.CreationTime = 0; /* do not change */
- /* In the future we should experiment - try setting timestamps
- via Handle (SetFileInfo) instead of by path */
- if (!(pTcon->ses->flags & CIFS_SES_NT4))
- rc = CIFSSMBSetTimes(xid, pTcon, full_path, &time_buf,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- else
- rc = -EOPNOTSUPP;
-
- if (rc == -EOPNOTSUPP) {
- int oplock = 0;
- __u16 netfid;
-
- cFYI(1, ("calling SetFileInfo since SetPathInfo for "
- "times not supported by this server"));
- /* BB we could scan to see if we already have it open
- and pass in pid of opener to function */
- rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
- SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
- CREATE_NOT_DIR, &netfid, &oplock,
- NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc == 0) {
- rc = CIFSSMBSetFileTimes(xid, pTcon, &time_buf,
- netfid);
- CIFSSMBClose(xid, pTcon, netfid);
- } else {
- /* BB For even older servers we could convert time_buf
- into old DOS style which uses two second
- granularity */
+ if (attrs->ia_valid & (ATTR_MTIME|ATTR_ATIME|ATTR_CTIME) ||
+ ((attrs->ia_valid & ATTR_MODE) && dosattr)) {
+ rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr);
+ /* BB: check for rc = -EOPNOTSUPP and switch to legacy mode */
- /* rc = CIFSSMBSetTimesLegacy(xid, pTcon, full_path,
- &time_buf, cifs_sb->local_nls); */
- }
- }
/* Even if error on time set, no sense failing the call if
the server would set the time to a reasonable value anyway,
and this check ensures that we are not being called from
sys_utimes in which case we ought to fail the call back to
the user when the server rejects the call */
if ((rc) && (attrs->ia_valid &
- (ATTR_MODE | ATTR_GID | ATTR_UID | ATTR_SIZE)))
+ (ATTR_MODE | ATTR_GID | ATTR_UID | ATTR_SIZE)))
rc = 0;
}
@@ -1718,6 +1883,21 @@ cifs_setattr_exit:
return rc;
}
+int
+cifs_setattr(struct dentry *direntry, struct iattr *attrs)
+{
+ struct inode *inode = direntry->d_inode;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct cifsTconInfo *pTcon = cifs_sb->tcon;
+
+ if (pTcon->unix_ext)
+ return cifs_setattr_unix(direntry, attrs);
+
+ return cifs_setattr_nounix(direntry, attrs);
+
+ /* BB: add cifs_setattr_legacy for really old servers */
+}
+
#if 0
void cifs_delete_inode(struct inode *inode)
{
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 4b17f8fe315..88786ba02d2 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -150,8 +150,7 @@ cifs_buf_get(void)
but it may be more efficient to always alloc same size
albeit slightly larger than necessary and maxbuffersize
defaults to this and can not be bigger */
- ret_buf = (struct smb_hdr *) mempool_alloc(cifs_req_poolp,
- GFP_KERNEL | GFP_NOFS);
+ ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS);
/* clear the first few header bytes */
/* for most paths, more is cleared in header_assemble */
@@ -188,8 +187,7 @@ cifs_small_buf_get(void)
but it may be more efficient to always alloc same size
albeit slightly larger than necessary and maxbuffersize
defaults to this and can not be bigger */
- ret_buf = (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp,
- GFP_KERNEL | GFP_NOFS);
+ ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS);
if (ret_buf) {
/* No need to clear memory here, cleared in header assemble */
/* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
@@ -313,8 +311,6 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES;
buffer->Pid = cpu_to_le16((__u16)current->tgid);
buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16));
- spin_lock(&GlobalMid_Lock);
- spin_unlock(&GlobalMid_Lock);
if (treeCon) {
buffer->Tid = treeCon->tid;
if (treeCon->ses) {
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 83f30695488..765adf12d54 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -640,6 +640,70 @@ static int is_dir_changed(struct file *file)
}
+static int cifs_save_resume_key(const char *current_entry,
+ struct cifsFileInfo *cifsFile)
+{
+ int rc = 0;
+ unsigned int len = 0;
+ __u16 level;
+ char *filename;
+
+ if ((cifsFile == NULL) || (current_entry == NULL))
+ return -EINVAL;
+
+ level = cifsFile->srch_inf.info_level;
+
+ if (level == SMB_FIND_FILE_UNIX) {
+ FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
+
+ filename = &pFindData->FileName[0];
+ if (cifsFile->srch_inf.unicode) {
+ len = cifs_unicode_bytelen(filename);
+ } else {
+ /* BB should we make this strnlen of PATH_MAX? */
+ len = strnlen(filename, PATH_MAX);
+ }
+ cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
+ } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
+ FILE_DIRECTORY_INFO *pFindData =
+ (FILE_DIRECTORY_INFO *)current_entry;
+ filename = &pFindData->FileName[0];
+ len = le32_to_cpu(pFindData->FileNameLength);
+ cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+ } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
+ FILE_FULL_DIRECTORY_INFO *pFindData =
+ (FILE_FULL_DIRECTORY_INFO *)current_entry;
+ filename = &pFindData->FileName[0];
+ len = le32_to_cpu(pFindData->FileNameLength);
+ cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+ } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
+ SEARCH_ID_FULL_DIR_INFO *pFindData =
+ (SEARCH_ID_FULL_DIR_INFO *)current_entry;
+ filename = &pFindData->FileName[0];
+ len = le32_to_cpu(pFindData->FileNameLength);
+ cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+ } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
+ FILE_BOTH_DIRECTORY_INFO *pFindData =
+ (FILE_BOTH_DIRECTORY_INFO *)current_entry;
+ filename = &pFindData->FileName[0];
+ len = le32_to_cpu(pFindData->FileNameLength);
+ cifsFile->srch_inf.resume_key = pFindData->FileIndex;
+ } else if (level == SMB_FIND_FILE_INFO_STANDARD) {
+ FIND_FILE_STANDARD_INFO *pFindData =
+ (FIND_FILE_STANDARD_INFO *)current_entry;
+ filename = &pFindData->FileName[0];
+ /* one byte length, no name conversion */
+ len = (unsigned int)pFindData->FileNameLength;
+ cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
+ } else {
+ cFYI(1, ("Unknown findfirst level %d", level));
+ return -EINVAL;
+ }
+ cifsFile->srch_inf.resume_name_len = len;
+ cifsFile->srch_inf.presume_name = filename;
+ return rc;
+}
+
/* find the corresponding entry in the search */
/* Note that the SMB server returns search entries for . and .. which
complicates logic here if we choose to parse for them and we do not
@@ -690,6 +754,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
else
cifs_buf_release(cifsFile->srch_inf.
ntwrk_buf_start);
+ cifsFile->srch_inf.ntwrk_buf_start = NULL;
}
rc = initiate_cifs_search(xid, file);
if (rc) {
@@ -702,6 +767,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) &&
(rc == 0) && !cifsFile->srch_inf.endOfSearch) {
cFYI(1, ("calling findnext2"));
+ cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile);
rc = CIFSFindNext(xid, pTcon, cifsFile->netfid,
&cifsFile->srch_inf);
if (rc)
@@ -918,69 +984,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
return rc;
}
-static int cifs_save_resume_key(const char *current_entry,
- struct cifsFileInfo *cifsFile)
-{
- int rc = 0;
- unsigned int len = 0;
- __u16 level;
- char *filename;
-
- if ((cifsFile == NULL) || (current_entry == NULL))
- return -EINVAL;
-
- level = cifsFile->srch_inf.info_level;
-
- if (level == SMB_FIND_FILE_UNIX) {
- FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry;
-
- filename = &pFindData->FileName[0];
- if (cifsFile->srch_inf.unicode) {
- len = cifs_unicode_bytelen(filename);
- } else {
- /* BB should we make this strnlen of PATH_MAX? */
- len = strnlen(filename, PATH_MAX);
- }
- cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
- } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
- FILE_DIRECTORY_INFO *pFindData =
- (FILE_DIRECTORY_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- cifsFile->srch_inf.resume_key = pFindData->FileIndex;
- } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) {
- FILE_FULL_DIRECTORY_INFO *pFindData =
- (FILE_FULL_DIRECTORY_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- cifsFile->srch_inf.resume_key = pFindData->FileIndex;
- } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) {
- SEARCH_ID_FULL_DIR_INFO *pFindData =
- (SEARCH_ID_FULL_DIR_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- cifsFile->srch_inf.resume_key = pFindData->FileIndex;
- } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) {
- FILE_BOTH_DIRECTORY_INFO *pFindData =
- (FILE_BOTH_DIRECTORY_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- len = le32_to_cpu(pFindData->FileNameLength);
- cifsFile->srch_inf.resume_key = pFindData->FileIndex;
- } else if (level == SMB_FIND_FILE_INFO_STANDARD) {
- FIND_FILE_STANDARD_INFO *pFindData =
- (FIND_FILE_STANDARD_INFO *)current_entry;
- filename = &pFindData->FileName[0];
- /* one byte length, no name conversion */
- len = (unsigned int)pFindData->FileNameLength;
- cifsFile->srch_inf.resume_key = pFindData->ResumeKey;
- } else {
- cFYI(1, ("Unknown findfirst level %d", level));
- return -EINVAL;
- }
- cifsFile->srch_inf.resume_name_len = len;
- cifsFile->srch_inf.presume_name = filename;
- return rc;
-}
int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
{
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index ed150efbe27..2851d5da0c8 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -409,6 +409,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
#ifdef CONFIG_CIFS_WEAK_PW_HASH
char lnm_session_key[CIFS_SESS_KEY_SIZE];
+ pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
+
/* no capabilities flags in old lanman negotiation */
pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
@@ -505,7 +507,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
} else
ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
- } else if (type == Kerberos) {
+ } else if (type == Kerberos || type == MSKerberos) {
#ifdef CONFIG_CIFS_UPCALL
struct cifs_spnego_msg *msg;
spnego_key = cifs_get_spnego_key(ses);
@@ -516,6 +518,15 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
}
msg = spnego_key->payload.data;
+ /* check version field to make sure that cifs.upcall is
+ sending us a response in an expected form */
+ if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
+ cERROR(1, ("incorrect version of cifs.upcall (expected"
+ " %d but got %d)",
+ CIFS_SPNEGO_UPCALL_VERSION, msg->version));
+ rc = -EKEYREJECTED;
+ goto ssetup_exit;
+ }
/* bail out if key is too long */
if (msg->sesskey_len >
sizeof(ses->server->mac_signing_key.data.krb5)) {
@@ -613,8 +624,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time,
ses, nls_cp);
ssetup_exit:
- if (spnego_key)
+ if (spnego_key) {
+ key_revoke(spnego_key);
key_put(spnego_key);
+ }
kfree(str_area);
if (resp_buf_type == CIFS_SMALL_BUFFER) {
cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base));
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 000ac509c98..bf0e6d8e382 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -50,8 +50,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses)
return NULL;
}
- temp = (struct mid_q_entry *) mempool_alloc(cifs_mid_poolp,
- GFP_KERNEL | GFP_NOFS);
+ temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
if (temp == NULL)
return temp;
else {
@@ -265,6 +264,7 @@ smb_send2(struct socket *ssocket, struct kvec *iov, int n_vec,
cFYI(1, ("Sending smb: total_len %d", total_len));
dump_smb(smb_buffer, len);
+ i = 0;
while (total_len) {
rc = kernel_sendmsg(ssocket, &smb_msg, &iov[first_vec],
n_vec - first_vec, total_len);
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index e1c854890f9..bf4a3fd3c8e 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -28,11 +28,9 @@ int coda_fake_statfs;
char * coda_f2s(struct CodaFid *f)
{
static char s[60];
-#ifdef CONFIG_CODA_FS_OLD_API
- sprintf(s, "(%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2]);
-#else
+
sprintf(s, "(%08x.%08x.%08x.%08x)", f->opaque[0], f->opaque[1], f->opaque[2], f->opaque[3]);
-#endif
+
return s;
}
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 3d2580e00a3..c5916228243 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -137,9 +137,11 @@ exit:
}
-int coda_permission(struct inode *inode, int mask, struct nameidata *nd)
+int coda_permission(struct inode *inode, int mask)
{
int error = 0;
+
+ mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
if (!mask)
return 0;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 2f58dfc7008..830f51abb97 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -58,7 +58,7 @@ static void coda_destroy_inode(struct inode *inode)
kmem_cache_free(coda_inode_cachep, ITOC(inode));
}
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
{
struct coda_inode_info *ei = (struct coda_inode_info *) foo;
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index c21a1f552a6..c51365422aa 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -24,8 +24,7 @@
#include <linux/coda_psdev.h>
/* pioctl ops */
-static int coda_ioctl_permission(struct inode *inode, int mask,
- struct nameidata *nd);
+static int coda_ioctl_permission(struct inode *inode, int mask);
static int coda_pioctl(struct inode * inode, struct file * filp,
unsigned int cmd, unsigned long user_data);
@@ -42,8 +41,7 @@ const struct file_operations coda_ioctl_operations = {
};
/* the coda pioctl inode ops */
-static int coda_ioctl_permission(struct inode *inode, int mask,
- struct nameidata *nd)
+static int coda_ioctl_permission(struct inode *inode, int mask)
{
return 0;
}
@@ -51,7 +49,7 @@ static int coda_ioctl_permission(struct inode *inode, int mask,
static int coda_pioctl(struct inode * inode, struct file * filp,
unsigned int cmd, unsigned long user_data)
{
- struct nameidata nd;
+ struct path path;
int error;
struct PioctlData data;
struct inode *target_inode = NULL;
@@ -66,21 +64,21 @@ static int coda_pioctl(struct inode * inode, struct file * filp,
* Look up the pathname. Note that the pathname is in
* user memory, and namei takes care of this
*/
- if ( data.follow ) {
- error = user_path_walk(data.path, &nd);
+ if (data.follow) {
+ error = user_path(data.path, &path);
} else {
- error = user_path_walk_link(data.path, &nd);
+ error = user_lpath(data.path, &path);
}
if ( error ) {
return error;
} else {
- target_inode = nd.path.dentry->d_inode;
+ target_inode = path.dentry->d_inode;
}
/* return if it is not a Coda inode */
if ( target_inode->i_sb != inode->i_sb ) {
- path_put(&nd.path);
+ path_put(&path);
return -EINVAL;
}
@@ -89,7 +87,7 @@ static int coda_pioctl(struct inode * inode, struct file * filp,
error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
- path_put(&nd.path);
+ path_put(&path);
return error;
}
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index e3eb3556622..0d9b80ec689 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -362,8 +362,9 @@ static int init_coda_psdev(void)
goto out_chrdev;
}
for (i = 0; i < MAX_CODADEVS; i++)
- device_create(coda_psdev_class, NULL,
- MKDEV(CODA_PSDEV_MAJOR,i), "cfs%d", i);
+ device_create_drvdata(coda_psdev_class, NULL,
+ MKDEV(CODA_PSDEV_MAJOR, i),
+ NULL, "cfs%d", i);
coda_sysctl_init();
goto out;
@@ -377,11 +378,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. Braam");
MODULE_DESCRIPTION("Coda Distributed File System VFS interface");
MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR);
MODULE_LICENSE("GPL");
-#ifdef CONFIG_CODA_FS_OLD_API
-MODULE_VERSION("5.3.21");
-#else
MODULE_VERSION("6.6");
-#endif
static int __init init_coda(void)
{
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 359e531094d..ce432bca95d 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -52,12 +52,8 @@ static void *alloc_upcall(int opcode, int size)
inp->ih.opcode = opcode;
inp->ih.pid = current->pid;
inp->ih.pgid = task_pgrp_nr(current);
-#ifdef CONFIG_CODA_FS_OLD_API
- memset(&inp->ih.cred, 0, sizeof(struct coda_cred));
- inp->ih.cred.cr_fsuid = current->fsuid;
-#else
inp->ih.uid = current->fsuid;
-#endif
+
return (void*)inp;
}
@@ -166,20 +162,11 @@ int venus_close(struct super_block *sb, struct CodaFid *fid, int flags,
union inputArgs *inp;
union outputArgs *outp;
int insize, outsize, error;
-#ifdef CONFIG_CODA_FS_OLD_API
- struct coda_cred cred = { 0, };
- cred.cr_fsuid = uid;
-#endif
insize = SIZE(release);
UPARG(CODA_CLOSE);
-#ifdef CONFIG_CODA_FS_OLD_API
- memcpy(&(inp->ih.cred), &cred, sizeof(cred));
-#else
inp->ih.uid = uid;
-#endif
-
inp->coda_close.VFid = *fid;
inp->coda_close.flags = flags;
diff --git a/fs/compat.c b/fs/compat.c
index ed43e17a5dc..075d0509970 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -197,8 +197,8 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
{
if (sizeof ubuf->f_blocks == 4) {
- if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail) &
- 0xffffffff00000000ULL)
+ if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
+ kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
return -EOVERFLOW;
/* f_files and f_ffree may be -1; it's okay
* to stuff that into 32 bits */
@@ -234,18 +234,18 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
* The following statfs calls are copies of code from fs/open.c and
* should be checked against those from time to time
*/
-asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs __user *buf)
+asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
{
- struct nameidata nd;
+ struct path path;
int error;
- error = user_path_walk(path, &nd);
+ error = user_path(pathname, &path);
if (!error) {
struct kstatfs tmp;
- error = vfs_statfs(nd.path.dentry, &tmp);
+ error = vfs_statfs(path.dentry, &tmp);
if (!error)
error = put_compat_statfs(buf, &tmp);
- path_put(&nd.path);
+ path_put(&path);
}
return error;
}
@@ -271,8 +271,8 @@ out:
static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
{
if (sizeof ubuf->f_blocks == 4) {
- if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail) &
- 0xffffffff00000000ULL)
+ if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
+ kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
return -EOVERFLOW;
/* f_files and f_ffree may be -1; it's okay
* to stuff that into 32 bits */
@@ -299,21 +299,21 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
return 0;
}
-asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, struct compat_statfs64 __user *buf)
+asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
{
- struct nameidata nd;
+ struct path path;
int error;
if (sz != sizeof(*buf))
return -EINVAL;
- error = user_path_walk(path, &nd);
+ error = user_path(pathname, &path);
if (!error) {
struct kstatfs tmp;
- error = vfs_statfs(nd.path.dentry, &tmp);
+ error = vfs_statfs(path.dentry, &tmp);
if (!error)
error = put_compat_statfs64(buf, &tmp);
- path_put(&nd.path);
+ path_put(&path);
}
return error;
}
@@ -792,8 +792,10 @@ static int compat_fillonedir(void *__buf, const char *name, int namlen,
if (buf->result)
return -EINVAL;
d_ino = ino;
- if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+ if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+ buf->result = -EOVERFLOW;
return -EOVERFLOW;
+ }
buf->result++;
dirent = buf->dirent;
if (!access_ok(VERIFY_WRITE, dirent,
@@ -862,8 +864,10 @@ static int compat_filldir(void *__buf, const char *name, int namlen,
if (reclen > buf->count)
return -EINVAL;
d_ino = ino;
- if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+ if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+ buf->error = -EOVERFLOW;
return -EOVERFLOW;
+ }
dirent = buf->previous;
if (dirent) {
if (__put_user(offset, &dirent->d_off))
@@ -2131,9 +2135,9 @@ asmlinkage long compat_sys_epoll_pwait(int epfd,
#ifdef CONFIG_SIGNALFD
-asmlinkage long compat_sys_signalfd(int ufd,
- const compat_sigset_t __user *sigmask,
- compat_size_t sigsetsize)
+asmlinkage long compat_sys_signalfd4(int ufd,
+ const compat_sigset_t __user *sigmask,
+ compat_size_t sigsetsize, int flags)
{
compat_sigset_t ss32;
sigset_t tmp;
@@ -2148,9 +2152,15 @@ asmlinkage long compat_sys_signalfd(int ufd,
if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
return -EFAULT;
- return sys_signalfd(ufd, ksigmask, sizeof(sigset_t));
+ return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags);
}
+asmlinkage long compat_sys_signalfd(int ufd,
+ const compat_sigset_t __user *sigmask,
+ compat_size_t sigsetsize)
+{
+ return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0);
+}
#endif /* CONFIG_SIGNALFD */
#ifdef CONFIG_TIMERFD
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 97dba0d9234..5235c67e759 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -25,7 +25,6 @@
#include <linux/slab.h>
#include <linux/raid/md.h>
#include <linux/kd.h>
-#include <linux/dirent.h>
#include <linux/route.h>
#include <linux/in6.h>
#include <linux/ipv6_route.h>
@@ -58,7 +57,6 @@
#include <linux/syscalls.h>
#include <linux/i2c.h>
#include <linux/i2c-dev.h>
-#include <linux/wireless.h>
#include <linux/atalk.h>
#include <linux/loop.h>
@@ -69,9 +67,11 @@
#include <linux/capi.h>
#include <linux/gigaset_dev.h>
+#ifdef CONFIG_BLOCK
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/sg.h>
+#endif
#include <asm/uaccess.h>
#include <linux/ethtool.h>
@@ -1757,64 +1757,6 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, unsigned long a
return sys_ioctl(fd, cmd, (unsigned long)tdata);
}
-struct compat_iw_point {
- compat_caddr_t pointer;
- __u16 length;
- __u16 flags;
-};
-
-static int do_wireless_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
- struct iwreq __user *iwr;
- struct iwreq __user *iwr_u;
- struct iw_point __user *iwp;
- struct compat_iw_point __user *iwp_u;
- compat_caddr_t pointer_u;
- void __user *pointer;
- __u16 length, flags;
- int ret;
-
- iwr_u = compat_ptr(arg);
- iwp_u = (struct compat_iw_point __user *) &iwr_u->u.data;
- iwr = compat_alloc_user_space(sizeof(*iwr));
- if (iwr == NULL)
- return -ENOMEM;
-
- iwp = &iwr->u.data;
-
- if (!access_ok(VERIFY_WRITE, iwr, sizeof(*iwr)))
- return -EFAULT;
-
- if (__copy_in_user(&iwr->ifr_ifrn.ifrn_name[0],
- &iwr_u->ifr_ifrn.ifrn_name[0],
- sizeof(iwr->ifr_ifrn.ifrn_name)))
- return -EFAULT;
-
- if (__get_user(pointer_u, &iwp_u->pointer) ||
- __get_user(length, &iwp_u->length) ||
- __get_user(flags, &iwp_u->flags))
- return -EFAULT;
-
- if (__put_user(compat_ptr(pointer_u), &iwp->pointer) ||
- __put_user(length, &iwp->length) ||
- __put_user(flags, &iwp->flags))
- return -EFAULT;
-
- ret = sys_ioctl(fd, cmd, (unsigned long) iwr);
-
- if (__get_user(pointer, &iwp->pointer) ||
- __get_user(length, &iwp->length) ||
- __get_user(flags, &iwp->flags))
- return -EFAULT;
-
- if (__put_user(ptr_to_compat(pointer), &iwp_u->pointer) ||
- __put_user(length, &iwp_u->length) ||
- __put_user(flags, &iwp_u->flags))
- return -EFAULT;
-
- return ret;
-}
-
/* Since old style bridge ioctl's endup using SIOCDEVPRIVATE
* for some operations; this forces use of the newer bridge-utils that
* use compatiable ioctls
@@ -2024,6 +1966,7 @@ COMPATIBLE_IOCTL(GIO_UNISCRNMAP)
COMPATIBLE_IOCTL(PIO_UNISCRNMAP)
COMPATIBLE_IOCTL(PIO_FONTRESET)
COMPATIBLE_IOCTL(PIO_UNIMAPCLR)
+#ifdef CONFIG_BLOCK
/* Big S */
COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK)
@@ -2033,6 +1976,7 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER)
COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND)
COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST)
COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI)
+#endif
/* Big T */
COMPATIBLE_IOCTL(TUNSETNOCSUM)
COMPATIBLE_IOCTL(TUNSETDEBUG)
@@ -2103,6 +2047,7 @@ COMPATIBLE_IOCTL(SIOCGIFVLAN)
COMPATIBLE_IOCTL(SIOCSIFVLAN)
COMPATIBLE_IOCTL(SIOCBRADDBR)
COMPATIBLE_IOCTL(SIOCBRDELBR)
+#ifdef CONFIG_BLOCK
/* SG stuff */
COMPATIBLE_IOCTL(SG_SET_TIMEOUT)
COMPATIBLE_IOCTL(SG_GET_TIMEOUT)
@@ -2127,6 +2072,7 @@ COMPATIBLE_IOCTL(SG_SCSI_RESET)
COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE)
COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN)
COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN)
+#endif
/* PPP stuff */
COMPATIBLE_IOCTL(PPPIOCGFLAGS)
COMPATIBLE_IOCTL(PPPIOCSFLAGS)
@@ -2350,8 +2296,6 @@ COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOVER)
COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE)
COMPATIBLE_IOCTL(AUTOFS_IOC_EXPIRE_MULTI)
COMPATIBLE_IOCTL(AUTOFS_IOC_PROTOSUBVER)
-COMPATIBLE_IOCTL(AUTOFS_IOC_ASKREGHOST)
-COMPATIBLE_IOCTL(AUTOFS_IOC_TOGGLEREGHOST)
COMPATIBLE_IOCTL(AUTOFS_IOC_ASKUMOUNT)
/* Raw devices */
COMPATIBLE_IOCTL(RAW_SETBIND)
@@ -2399,6 +2343,7 @@ COMPATIBLE_IOCTL(HCIGETDEVLIST)
COMPATIBLE_IOCTL(HCIGETDEVINFO)
COMPATIBLE_IOCTL(HCIGETCONNLIST)
COMPATIBLE_IOCTL(HCIGETCONNINFO)
+COMPATIBLE_IOCTL(HCIGETAUTHINFO)
COMPATIBLE_IOCTL(HCISETRAW)
COMPATIBLE_IOCTL(HCISETSCAN)
COMPATIBLE_IOCTL(HCISETAUTH)
@@ -2495,36 +2440,6 @@ COMPATIBLE_IOCTL(I2C_TENBIT)
COMPATIBLE_IOCTL(I2C_PEC)
COMPATIBLE_IOCTL(I2C_RETRIES)
COMPATIBLE_IOCTL(I2C_TIMEOUT)
-/* wireless */
-COMPATIBLE_IOCTL(SIOCSIWCOMMIT)
-COMPATIBLE_IOCTL(SIOCGIWNAME)
-COMPATIBLE_IOCTL(SIOCSIWNWID)
-COMPATIBLE_IOCTL(SIOCGIWNWID)
-COMPATIBLE_IOCTL(SIOCSIWFREQ)
-COMPATIBLE_IOCTL(SIOCGIWFREQ)
-COMPATIBLE_IOCTL(SIOCSIWMODE)
-COMPATIBLE_IOCTL(SIOCGIWMODE)
-COMPATIBLE_IOCTL(SIOCSIWSENS)
-COMPATIBLE_IOCTL(SIOCGIWSENS)
-COMPATIBLE_IOCTL(SIOCSIWRANGE)
-COMPATIBLE_IOCTL(SIOCSIWPRIV)
-COMPATIBLE_IOCTL(SIOCSIWSTATS)
-COMPATIBLE_IOCTL(SIOCSIWAP)
-COMPATIBLE_IOCTL(SIOCGIWAP)
-COMPATIBLE_IOCTL(SIOCSIWRATE)
-COMPATIBLE_IOCTL(SIOCGIWRATE)
-COMPATIBLE_IOCTL(SIOCSIWRTS)
-COMPATIBLE_IOCTL(SIOCGIWRTS)
-COMPATIBLE_IOCTL(SIOCSIWFRAG)
-COMPATIBLE_IOCTL(SIOCGIWFRAG)
-COMPATIBLE_IOCTL(SIOCSIWTXPOW)
-COMPATIBLE_IOCTL(SIOCGIWTXPOW)
-COMPATIBLE_IOCTL(SIOCSIWRETRY)
-COMPATIBLE_IOCTL(SIOCGIWRETRY)
-COMPATIBLE_IOCTL(SIOCSIWPOWER)
-COMPATIBLE_IOCTL(SIOCGIWPOWER)
-COMPATIBLE_IOCTL(SIOCSIWAUTH)
-COMPATIBLE_IOCTL(SIOCGIWAUTH)
/* hiddev */
COMPATIBLE_IOCTL(HIDIOCGVERSION)
COMPATIBLE_IOCTL(HIDIOCAPPLICATION)
@@ -2755,29 +2670,7 @@ COMPATIBLE_IOCTL(USBDEVFS_IOCTL32)
HANDLE_IOCTL(I2C_FUNCS, w_long)
HANDLE_IOCTL(I2C_RDWR, do_i2c_rdwr_ioctl)
HANDLE_IOCTL(I2C_SMBUS, do_i2c_smbus_ioctl)
-/* wireless */
-HANDLE_IOCTL(SIOCGIWRANGE, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWPRIV, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWSTATS, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWSPY, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWSPY, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWTHRSPY, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWTHRSPY, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWMLME, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWAPLIST, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWSCAN, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWSCAN, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWESSID, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWESSID, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWNICKN, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWNICKN, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWENCODE, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWENCODE, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWGENIE, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWGENIE, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWENCODEEXT, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCGIWENCODEEXT, do_wireless_ioctl)
-HANDLE_IOCTL(SIOCSIWPMKSA, do_wireless_ioctl)
+/* bridge */
HANDLE_IOCTL(SIOCSIFBR, old_bridge_ioctl)
HANDLE_IOCTL(SIOCGIFBR, old_bridge_ioctl)
/* Not implemented in the native kernel */
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index cca98609aa7..762d287123c 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -26,6 +26,7 @@
#include <linux/slab.h>
#include <linux/list.h>
+#include <linux/spinlock.h>
struct configfs_dirent {
atomic_t s_count;
@@ -47,8 +48,13 @@ struct configfs_dirent {
#define CONFIGFS_USET_DIR 0x0040
#define CONFIGFS_USET_DEFAULT 0x0080
#define CONFIGFS_USET_DROPPING 0x0100
+#define CONFIGFS_USET_IN_MKDIR 0x0200
+#define CONFIGFS_USET_CREATING 0x0400
#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR)
+extern struct mutex configfs_symlink_mutex;
+extern spinlock_t configfs_dirent_lock;
+
extern struct vfsmount * configfs_mount;
extern struct kmem_cache *configfs_dir_cachep;
@@ -62,6 +68,7 @@ extern void configfs_inode_exit(void);
extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
extern int configfs_make_dirent(struct configfs_dirent *,
struct dentry *, void *, umode_t, int);
+extern int configfs_dirent_is_ready(struct configfs_dirent *);
extern int configfs_add_file(struct dentry *, const struct configfs_attribute *, int);
extern void configfs_hash_and_remove(struct dentry * dir, const char * name);
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index a48dc7dd876..8e93341f3e8 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -30,11 +30,25 @@
#include <linux/mount.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/err.h>
#include <linux/configfs.h>
#include "configfs_internal.h"
DECLARE_RWSEM(configfs_rename_sem);
+/*
+ * Protects mutations of configfs_dirent linkage together with proper i_mutex
+ * Also protects mutations of symlinks linkage to target configfs_dirent
+ * Mutators of configfs_dirent linkage must *both* have the proper inode locked
+ * and configfs_dirent_lock locked, in that order.
+ * This allows one to safely traverse configfs_dirent trees and symlinks without
+ * having to lock inodes.
+ *
+ * Protects setting of CONFIGFS_USET_DROPPING: checking the flag
+ * unlocked is not reliable unless in detach_groups() called from
+ * rmdir()/unregister() and from configfs_attach_group()
+ */
+DEFINE_SPINLOCK(configfs_dirent_lock);
static void configfs_d_iput(struct dentry * dentry,
struct inode * inode)
@@ -74,13 +88,20 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
sd = kmem_cache_zalloc(configfs_dir_cachep, GFP_KERNEL);
if (!sd)
- return NULL;
+ return ERR_PTR(-ENOMEM);
atomic_set(&sd->s_count, 1);
INIT_LIST_HEAD(&sd->s_links);
INIT_LIST_HEAD(&sd->s_children);
- list_add(&sd->s_sibling, &parent_sd->s_children);
sd->s_element = element;
+ spin_lock(&configfs_dirent_lock);
+ if (parent_sd->s_type & CONFIGFS_USET_DROPPING) {
+ spin_unlock(&configfs_dirent_lock);
+ kmem_cache_free(configfs_dir_cachep, sd);
+ return ERR_PTR(-ENOENT);
+ }
+ list_add(&sd->s_sibling, &parent_sd->s_children);
+ spin_unlock(&configfs_dirent_lock);
return sd;
}
@@ -118,8 +139,8 @@ int configfs_make_dirent(struct configfs_dirent * parent_sd,
struct configfs_dirent * sd;
sd = configfs_new_dirent(parent_sd, element);
- if (!sd)
- return -ENOMEM;
+ if (IS_ERR(sd))
+ return PTR_ERR(sd);
sd->s_mode = mode;
sd->s_type = type;
@@ -164,7 +185,7 @@ static int create_dir(struct config_item * k, struct dentry * p,
error = configfs_dirent_exists(p->d_fsdata, d->d_name.name);
if (!error)
error = configfs_make_dirent(p->d_fsdata, d, k, mode,
- CONFIGFS_DIR);
+ CONFIGFS_DIR | CONFIGFS_USET_CREATING);
if (!error) {
error = configfs_create(d, mode, init_dir);
if (!error) {
@@ -173,7 +194,9 @@ static int create_dir(struct config_item * k, struct dentry * p,
} else {
struct configfs_dirent *sd = d->d_fsdata;
if (sd) {
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sd->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
configfs_put(sd);
}
}
@@ -186,6 +209,9 @@ static int create_dir(struct config_item * k, struct dentry * p,
* configfs_create_dir - create a directory for an config_item.
* @item: config_itemwe're creating directory for.
* @dentry: config_item's dentry.
+ *
+ * Note: user-created entries won't be allowed under this new directory
+ * until it is validated by configfs_dir_set_ready()
*/
static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
@@ -208,6 +234,44 @@ static int configfs_create_dir(struct config_item * item, struct dentry *dentry)
return error;
}
+/*
+ * Allow userspace to create new entries under a new directory created with
+ * configfs_create_dir(), and under all of its chidlren directories recursively.
+ * @sd configfs_dirent of the new directory to validate
+ *
+ * Caller must hold configfs_dirent_lock.
+ */
+static void configfs_dir_set_ready(struct configfs_dirent *sd)
+{
+ struct configfs_dirent *child_sd;
+
+ sd->s_type &= ~CONFIGFS_USET_CREATING;
+ list_for_each_entry(child_sd, &sd->s_children, s_sibling)
+ if (child_sd->s_type & CONFIGFS_USET_CREATING)
+ configfs_dir_set_ready(child_sd);
+}
+
+/*
+ * Check that a directory does not belong to a directory hierarchy being
+ * attached and not validated yet.
+ * @sd configfs_dirent of the directory to check
+ *
+ * @return non-zero iff the directory was validated
+ *
+ * Note: takes configfs_dirent_lock, so the result may change from false to true
+ * in two consecutive calls, but never from true to false.
+ */
+int configfs_dirent_is_ready(struct configfs_dirent *sd)
+{
+ int ret;
+
+ spin_lock(&configfs_dirent_lock);
+ ret = !(sd->s_type & CONFIGFS_USET_CREATING);
+ spin_unlock(&configfs_dirent_lock);
+
+ return ret;
+}
+
int configfs_create_link(struct configfs_symlink *sl,
struct dentry *parent,
struct dentry *dentry)
@@ -224,7 +288,9 @@ int configfs_create_link(struct configfs_symlink *sl,
else {
struct configfs_dirent *sd = dentry->d_fsdata;
if (sd) {
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sd->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
configfs_put(sd);
}
}
@@ -238,7 +304,9 @@ static void remove_dir(struct dentry * d)
struct configfs_dirent * sd;
sd = d->d_fsdata;
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sd->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
configfs_put(sd);
if (d->d_inode)
simple_rmdir(parent->d_inode,d);
@@ -256,6 +324,8 @@ static void remove_dir(struct dentry * d)
* The only thing special about this is that we remove any files in
* the directory before we remove the directory, and we've inlined
* what used to be configfs_rmdir() below, instead of calling separately.
+ *
+ * Caller holds the mutex of the item's inode
*/
static void configfs_remove_dir(struct config_item * item)
@@ -303,7 +373,19 @@ static struct dentry * configfs_lookup(struct inode *dir,
struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata;
struct configfs_dirent * sd;
int found = 0;
- int err = 0;
+ int err;
+
+ /*
+ * Fake invisibility if dir belongs to a group/default groups hierarchy
+ * being attached
+ *
+ * This forbids userspace to read/write attributes of items which may
+ * not complete their initialization, since the dentries of the
+ * attributes won't be instantiated.
+ */
+ err = -ENOENT;
+ if (!configfs_dirent_is_ready(parent_sd))
+ goto out;
list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
if (sd->s_type & CONFIGFS_NOT_PINNED) {
@@ -326,41 +408,49 @@ static struct dentry * configfs_lookup(struct inode *dir,
return simple_lookup(dir, dentry, nd);
}
+out:
return ERR_PTR(err);
}
/*
* Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
- * attributes and are removed by rmdir(). We recurse, taking i_mutex
- * on all children that are candidates for default detach. If the
- * result is clean, then configfs_detach_group() will handle dropping
- * i_mutex. If there is an error, the caller will clean up the i_mutex
- * holders via configfs_detach_rollback().
+ * attributes and are removed by rmdir(). We recurse, setting
+ * CONFIGFS_USET_DROPPING on all children that are candidates for
+ * default detach.
+ * If there is an error, the caller will reset the flags via
+ * configfs_detach_rollback().
*/
-static int configfs_detach_prep(struct dentry *dentry)
+static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex)
{
struct configfs_dirent *parent_sd = dentry->d_fsdata;
struct configfs_dirent *sd;
int ret;
+ /* Mark that we're trying to drop the group */
+ parent_sd->s_type |= CONFIGFS_USET_DROPPING;
+
ret = -EBUSY;
if (!list_empty(&parent_sd->s_links))
goto out;
ret = 0;
list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
- if (sd->s_type & CONFIGFS_NOT_PINNED)
+ if (!sd->s_element ||
+ (sd->s_type & CONFIGFS_NOT_PINNED))
continue;
if (sd->s_type & CONFIGFS_USET_DEFAULT) {
- mutex_lock(&sd->s_dentry->d_inode->i_mutex);
- /* Mark that we've taken i_mutex */
- sd->s_type |= CONFIGFS_USET_DROPPING;
+ /* Abort if racing with mkdir() */
+ if (sd->s_type & CONFIGFS_USET_IN_MKDIR) {
+ if (wait_mutex)
+ *wait_mutex = &sd->s_dentry->d_inode->i_mutex;
+ return -EAGAIN;
+ }
/*
* Yup, recursive. If there's a problem, blame
* deep nesting of default_groups
*/
- ret = configfs_detach_prep(sd->s_dentry);
+ ret = configfs_detach_prep(sd->s_dentry, wait_mutex);
if (!ret)
continue;
} else
@@ -374,7 +464,7 @@ out:
}
/*
- * Walk the tree, dropping i_mutex wherever CONFIGFS_USET_DROPPING is
+ * Walk the tree, resetting CONFIGFS_USET_DROPPING wherever it was
* set.
*/
static void configfs_detach_rollback(struct dentry *dentry)
@@ -382,16 +472,11 @@ static void configfs_detach_rollback(struct dentry *dentry)
struct configfs_dirent *parent_sd = dentry->d_fsdata;
struct configfs_dirent *sd;
- list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
- if (sd->s_type & CONFIGFS_USET_DEFAULT) {
- configfs_detach_rollback(sd->s_dentry);
+ parent_sd->s_type &= ~CONFIGFS_USET_DROPPING;
- if (sd->s_type & CONFIGFS_USET_DROPPING) {
- sd->s_type &= ~CONFIGFS_USET_DROPPING;
- mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
- }
- }
- }
+ list_for_each_entry(sd, &parent_sd->s_children, s_sibling)
+ if (sd->s_type & CONFIGFS_USET_DEFAULT)
+ configfs_detach_rollback(sd->s_dentry);
}
static void detach_attrs(struct config_item * item)
@@ -410,7 +495,9 @@ static void detach_attrs(struct config_item * item)
list_for_each_entry_safe(sd, tmp, &parent_sd->s_children, s_sibling) {
if (!sd->s_element || !(sd->s_type & CONFIGFS_NOT_PINNED))
continue;
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sd->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
configfs_drop_dentry(sd, dentry);
configfs_put(sd);
}
@@ -466,16 +553,12 @@ static void detach_groups(struct config_group *group)
child = sd->s_dentry;
+ mutex_lock(&child->d_inode->i_mutex);
+
configfs_detach_group(sd->s_element);
child->d_inode->i_flags |= S_DEAD;
- /*
- * From rmdir/unregister, a configfs_detach_prep() pass
- * has taken our i_mutex for us. Drop it.
- * From mkdir/register cleanup, there is no sem held.
- */
- if (sd->s_type & CONFIGFS_USET_DROPPING)
- mutex_unlock(&child->d_inode->i_mutex);
+ mutex_unlock(&child->d_inode->i_mutex);
d_delete(child);
dput(child);
@@ -532,36 +615,21 @@ static int create_default_group(struct config_group *parent_group,
static int populate_groups(struct config_group *group)
{
struct config_group *new_group;
- struct dentry *dentry = group->cg_item.ci_dentry;
int ret = 0;
int i;
if (group->default_groups) {
- /*
- * FYI, we're faking mkdir here
- * I'm not sure we need this semaphore, as we're called
- * from our parent's mkdir. That holds our parent's
- * i_mutex, so afaik lookup cannot continue through our
- * parent to find us, let alone mess with our tree.
- * That said, taking our i_mutex is closer to mkdir
- * emulation, and shouldn't hurt.
- */
- mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
-
for (i = 0; group->default_groups[i]; i++) {
new_group = group->default_groups[i];
ret = create_default_group(group, new_group);
- if (ret)
+ if (ret) {
+ detach_groups(group);
break;
+ }
}
-
- mutex_unlock(&dentry->d_inode->i_mutex);
}
- if (ret)
- detach_groups(group);
-
return ret;
}
@@ -676,7 +744,15 @@ static int configfs_attach_item(struct config_item *parent_item,
if (!ret) {
ret = populate_attrs(item);
if (ret) {
+ /*
+ * We are going to remove an inode and its dentry but
+ * the VFS may already have hit and used them. Thus,
+ * we must lock them as rmdir() would.
+ */
+ mutex_lock(&dentry->d_inode->i_mutex);
configfs_remove_dir(item);
+ dentry->d_inode->i_flags |= S_DEAD;
+ mutex_unlock(&dentry->d_inode->i_mutex);
d_delete(dentry);
}
}
@@ -684,6 +760,7 @@ static int configfs_attach_item(struct config_item *parent_item,
return ret;
}
+/* Caller holds the mutex of the item's inode */
static void configfs_detach_item(struct config_item *item)
{
detach_attrs(item);
@@ -702,16 +779,30 @@ static int configfs_attach_group(struct config_item *parent_item,
sd = dentry->d_fsdata;
sd->s_type |= CONFIGFS_USET_DIR;
+ /*
+ * FYI, we're faking mkdir in populate_groups()
+ * We must lock the group's inode to avoid races with the VFS
+ * which can already hit the inode and try to add/remove entries
+ * under it.
+ *
+ * We must also lock the inode to remove it safely in case of
+ * error, as rmdir() would.
+ */
+ mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
ret = populate_groups(to_config_group(item));
if (ret) {
configfs_detach_item(item);
- d_delete(dentry);
+ dentry->d_inode->i_flags |= S_DEAD;
}
+ mutex_unlock(&dentry->d_inode->i_mutex);
+ if (ret)
+ d_delete(dentry);
}
return ret;
}
+/* Caller holds the mutex of the group's inode */
static void configfs_detach_group(struct config_item *item)
{
detach_groups(to_config_group(item));
@@ -1001,14 +1092,15 @@ EXPORT_SYMBOL(configfs_undepend_item);
static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
- int ret, module_got = 0;
- struct config_group *group;
- struct config_item *item;
+ int ret = 0;
+ int module_got = 0;
+ struct config_group *group = NULL;
+ struct config_item *item = NULL;
struct config_item *parent_item;
struct configfs_subsystem *subsys;
struct configfs_dirent *sd;
struct config_item_type *type;
- struct module *owner = NULL;
+ struct module *subsys_owner = NULL, *new_item_owner = NULL;
char *name;
if (dentry->d_parent == configfs_sb->s_root) {
@@ -1017,6 +1109,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
}
sd = dentry->d_parent->d_fsdata;
+
+ /*
+ * Fake invisibility if dir belongs to a group/default groups hierarchy
+ * being attached
+ */
+ if (!configfs_dirent_is_ready(sd)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
if (!(sd->s_type & CONFIGFS_USET_DIR)) {
ret = -EPERM;
goto out;
@@ -1035,38 +1137,57 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
goto out_put;
}
+ /*
+ * The subsystem may belong to a different module than the item
+ * being created. We don't want to safely pin the new item but
+ * fail to pin the subsystem it sits under.
+ */
+ if (!subsys->su_group.cg_item.ci_type) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+ subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner;
+ if (!try_module_get(subsys_owner)) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL);
if (!name) {
ret = -ENOMEM;
- goto out_put;
+ goto out_subsys_put;
}
snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
mutex_lock(&subsys->su_mutex);
- group = NULL;
- item = NULL;
if (type->ct_group_ops->make_group) {
group = type->ct_group_ops->make_group(to_config_group(parent_item), name);
- if (group) {
+ if (!group)
+ group = ERR_PTR(-ENOMEM);
+ if (!IS_ERR(group)) {
link_group(to_config_group(parent_item), group);
item = &group->cg_item;
- }
+ } else
+ ret = PTR_ERR(group);
} else {
item = type->ct_group_ops->make_item(to_config_group(parent_item), name);
- if (item)
+ if (!item)
+ item = ERR_PTR(-ENOMEM);
+ if (!IS_ERR(item))
link_obj(parent_item, item);
+ else
+ ret = PTR_ERR(item);
}
mutex_unlock(&subsys->su_mutex);
kfree(name);
- if (!item) {
+ if (ret) {
/*
- * If item == NULL, then link_obj() was never called.
+ * If ret != 0, then link_obj() was never called.
* There are no extra references to clean up.
*/
- ret = -ENOMEM;
- goto out_put;
+ goto out_subsys_put;
}
/*
@@ -1080,8 +1201,8 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
goto out_unlink;
}
- owner = type->ct_owner;
- if (!try_module_get(owner)) {
+ new_item_owner = type->ct_owner;
+ if (!try_module_get(new_item_owner)) {
ret = -EINVAL;
goto out_unlink;
}
@@ -1093,11 +1214,28 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
*/
module_got = 1;
+ /*
+ * Make racing rmdir() fail if it did not tag parent with
+ * CONFIGFS_USET_DROPPING
+ * Note: if CONFIGFS_USET_DROPPING is already set, attach_group() will
+ * fail and let rmdir() terminate correctly
+ */
+ spin_lock(&configfs_dirent_lock);
+ /* This will make configfs_detach_prep() fail */
+ sd->s_type |= CONFIGFS_USET_IN_MKDIR;
+ spin_unlock(&configfs_dirent_lock);
+
if (group)
ret = configfs_attach_group(parent_item, item, dentry);
else
ret = configfs_attach_item(parent_item, item, dentry);
+ spin_lock(&configfs_dirent_lock);
+ sd->s_type &= ~CONFIGFS_USET_IN_MKDIR;
+ if (!ret)
+ configfs_dir_set_ready(dentry->d_fsdata);
+ spin_unlock(&configfs_dirent_lock);
+
out_unlink:
if (ret) {
/* Tear down everything we built up */
@@ -1113,9 +1251,13 @@ out_unlink:
mutex_unlock(&subsys->su_mutex);
if (module_got)
- module_put(owner);
+ module_put(new_item_owner);
}
+out_subsys_put:
+ if (ret)
+ module_put(subsys_owner);
+
out_put:
/*
* link_obj()/link_group() took a reference from child->parent,
@@ -1134,7 +1276,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
struct config_item *item;
struct configfs_subsystem *subsys;
struct configfs_dirent *sd;
- struct module *owner = NULL;
+ struct module *subsys_owner = NULL, *dead_item_owner = NULL;
int ret;
if (dentry->d_parent == configfs_sb->s_root)
@@ -1161,12 +1303,36 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
return -EINVAL;
}
- ret = configfs_detach_prep(dentry);
- if (ret) {
- configfs_detach_rollback(dentry);
- config_item_put(parent_item);
- return ret;
- }
+ /* configfs_mkdir() shouldn't have allowed this */
+ BUG_ON(!subsys->su_group.cg_item.ci_type);
+ subsys_owner = subsys->su_group.cg_item.ci_type->ct_owner;
+
+ /*
+ * Ensure that no racing symlink() will make detach_prep() fail while
+ * the new link is temporarily attached
+ */
+ do {
+ struct mutex *wait_mutex;
+
+ mutex_lock(&configfs_symlink_mutex);
+ spin_lock(&configfs_dirent_lock);
+ ret = configfs_detach_prep(dentry, &wait_mutex);
+ if (ret)
+ configfs_detach_rollback(dentry);
+ spin_unlock(&configfs_dirent_lock);
+ mutex_unlock(&configfs_symlink_mutex);
+
+ if (ret) {
+ if (ret != -EAGAIN) {
+ config_item_put(parent_item);
+ return ret;
+ }
+
+ /* Wait until the racing operation terminates */
+ mutex_lock(wait_mutex);
+ mutex_unlock(wait_mutex);
+ }
+ } while (ret == -EAGAIN);
/* Get a working ref for the duration of this function */
item = configfs_get_config_item(dentry);
@@ -1175,7 +1341,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
config_item_put(parent_item);
if (item->ci_type)
- owner = item->ci_type->ct_owner;
+ dead_item_owner = item->ci_type->ct_owner;
if (sd->s_type & CONFIGFS_USET_DIR) {
configfs_detach_group(item);
@@ -1197,7 +1363,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
/* Drop our reference from above */
config_item_put(item);
- module_put(owner);
+ module_put(dead_item_owner);
+ module_put(subsys_owner);
return 0;
}
@@ -1253,13 +1420,24 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
{
struct dentry * dentry = file->f_path.dentry;
struct configfs_dirent * parent_sd = dentry->d_fsdata;
+ int err;
mutex_lock(&dentry->d_inode->i_mutex);
- file->private_data = configfs_new_dirent(parent_sd, NULL);
+ /*
+ * Fake invisibility if dir belongs to a group/default groups hierarchy
+ * being attached
+ */
+ err = -ENOENT;
+ if (configfs_dirent_is_ready(parent_sd)) {
+ file->private_data = configfs_new_dirent(parent_sd, NULL);
+ if (IS_ERR(file->private_data))
+ err = PTR_ERR(file->private_data);
+ else
+ err = 0;
+ }
mutex_unlock(&dentry->d_inode->i_mutex);
- return file->private_data ? 0 : -ENOMEM;
-
+ return err;
}
static int configfs_dir_close(struct inode *inode, struct file *file)
@@ -1268,7 +1446,9 @@ static int configfs_dir_close(struct inode *inode, struct file *file)
struct configfs_dirent * cursor = file->private_data;
mutex_lock(&dentry->d_inode->i_mutex);
+ spin_lock(&configfs_dirent_lock);
list_del_init(&cursor->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
mutex_unlock(&dentry->d_inode->i_mutex);
release_configfs_dirent(cursor);
@@ -1308,7 +1488,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
/* fallthrough */
default:
if (filp->f_pos == 2) {
+ spin_lock(&configfs_dirent_lock);
list_move(q, &parent_sd->s_children);
+ spin_unlock(&configfs_dirent_lock);
}
for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
struct configfs_dirent *next;
@@ -1331,7 +1513,9 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
dt_type(next)) < 0)
return 0;
+ spin_lock(&configfs_dirent_lock);
list_move(q, p);
+ spin_unlock(&configfs_dirent_lock);
p = q;
filp->f_pos++;
}
@@ -1362,6 +1546,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
struct list_head *p;
loff_t n = file->f_pos - 2;
+ spin_lock(&configfs_dirent_lock);
list_del(&cursor->s_sibling);
p = sd->s_children.next;
while (n && p != &sd->s_children) {
@@ -1373,6 +1558,7 @@ static loff_t configfs_dir_lseek(struct file * file, loff_t offset, int origin)
p = p->next;
}
list_add_tail(&cursor->s_sibling, p);
+ spin_unlock(&configfs_dirent_lock);
}
}
mutex_unlock(&dentry->d_inode->i_mutex);
@@ -1422,6 +1608,10 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
if (err) {
d_delete(dentry);
dput(dentry);
+ } else {
+ spin_lock(&configfs_dirent_lock);
+ configfs_dir_set_ready(dentry->d_fsdata);
+ spin_unlock(&configfs_dirent_lock);
}
}
@@ -1448,9 +1638,13 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex,
I_MUTEX_PARENT);
mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
- if (configfs_detach_prep(dentry)) {
+ mutex_lock(&configfs_symlink_mutex);
+ spin_lock(&configfs_dirent_lock);
+ if (configfs_detach_prep(dentry, NULL)) {
printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
}
+ spin_unlock(&configfs_dirent_lock);
+ mutex_unlock(&configfs_symlink_mutex);
configfs_detach_group(&group->cg_item);
dentry->d_inode->i_flags |= S_DEAD;
mutex_unlock(&dentry->d_inode->i_mutex);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index b9a1d810346..4803ccc9448 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -247,7 +247,9 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
if (!sd->s_element)
continue;
if (!strcmp(configfs_get_name(sd), name)) {
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sd->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
configfs_drop_dentry(sd, dir);
configfs_put(sd);
break;
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 2a731ef5f30..bf74973b049 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -31,6 +31,9 @@
#include <linux/configfs.h>
#include "configfs_internal.h"
+/* Protects attachments of new symlinks */
+DEFINE_MUTEX(configfs_symlink_mutex);
+
static int item_depth(struct config_item * item)
{
struct config_item * p = item;
@@ -73,21 +76,34 @@ static int create_link(struct config_item *parent_item,
struct configfs_symlink *sl;
int ret;
+ ret = -ENOENT;
+ if (!configfs_dirent_is_ready(target_sd))
+ goto out;
ret = -ENOMEM;
sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL);
if (sl) {
sl->sl_target = config_item_get(item);
- /* FIXME: needs a lock, I'd bet */
+ spin_lock(&configfs_dirent_lock);
+ if (target_sd->s_type & CONFIGFS_USET_DROPPING) {
+ spin_unlock(&configfs_dirent_lock);
+ config_item_put(item);
+ kfree(sl);
+ return -ENOENT;
+ }
list_add(&sl->sl_list, &target_sd->s_links);
+ spin_unlock(&configfs_dirent_lock);
ret = configfs_create_link(sl, parent_item->ci_dentry,
dentry);
if (ret) {
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sl->sl_list);
+ spin_unlock(&configfs_dirent_lock);
config_item_put(item);
kfree(sl);
}
}
+out:
return ret;
}
@@ -117,6 +133,7 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
{
int ret;
struct nameidata nd;
+ struct configfs_dirent *sd;
struct config_item *parent_item;
struct config_item *target_item;
struct config_item_type *type;
@@ -125,9 +142,19 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
if (dentry->d_parent == configfs_sb->s_root)
goto out;
+ sd = dentry->d_parent->d_fsdata;
+ /*
+ * Fake invisibility if dir belongs to a group/default groups hierarchy
+ * being attached
+ */
+ ret = -ENOENT;
+ if (!configfs_dirent_is_ready(sd))
+ goto out;
+
parent_item = configfs_get_config_item(dentry->d_parent);
type = parent_item->ci_type;
+ ret = -EPERM;
if (!type || !type->ct_item_ops ||
!type->ct_item_ops->allow_link)
goto out_put;
@@ -137,8 +164,14 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
goto out_put;
ret = type->ct_item_ops->allow_link(parent_item, target_item);
- if (!ret)
+ if (!ret) {
+ mutex_lock(&configfs_symlink_mutex);
ret = create_link(parent_item, target_item, dentry);
+ mutex_unlock(&configfs_symlink_mutex);
+ if (ret && type->ct_item_ops->drop_link)
+ type->ct_item_ops->drop_link(parent_item,
+ target_item);
+ }
config_item_put(target_item);
path_put(&nd.path);
@@ -169,7 +202,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
parent_item = configfs_get_config_item(dentry->d_parent);
type = parent_item->ci_type;
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sd->s_sibling);
+ spin_unlock(&configfs_dirent_lock);
configfs_drop_dentry(sd, dentry->d_parent);
dput(dentry);
configfs_put(sd);
@@ -184,8 +219,9 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry)
type->ct_item_ops->drop_link(parent_item,
sl->sl_target);
- /* FIXME: Needs lock */
+ spin_lock(&configfs_dirent_lock);
list_del_init(&sl->sl_list);
+ spin_unlock(&configfs_dirent_lock);
/* Put reference from create_link() */
config_item_put(sl->sl_target);
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 0c3b618c15b..f40423eb1a1 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -43,58 +43,13 @@ static DEFINE_MUTEX(read_mutex);
static int cramfs_iget5_test(struct inode *inode, void *opaque)
{
struct cramfs_inode *cramfs_inode = opaque;
-
- if (inode->i_ino != CRAMINO(cramfs_inode))
- return 0; /* does not match */
-
- if (inode->i_ino != 1)
- return 1;
-
- /* all empty directories, char, block, pipe, and sock, share inode #1 */
-
- if ((inode->i_mode != cramfs_inode->mode) ||
- (inode->i_gid != cramfs_inode->gid) ||
- (inode->i_uid != cramfs_inode->uid))
- return 0; /* does not match */
-
- if ((S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) &&
- (inode->i_rdev != old_decode_dev(cramfs_inode->size)))
- return 0; /* does not match */
-
- return 1; /* matches */
+ return inode->i_ino == CRAMINO(cramfs_inode) && inode->i_ino != 1;
}
static int cramfs_iget5_set(struct inode *inode, void *opaque)
{
- static struct timespec zerotime;
struct cramfs_inode *cramfs_inode = opaque;
- inode->i_mode = cramfs_inode->mode;
- inode->i_uid = cramfs_inode->uid;
- inode->i_size = cramfs_inode->size;
- inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
- inode->i_gid = cramfs_inode->gid;
- /* Struct copy intentional */
- inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
inode->i_ino = CRAMINO(cramfs_inode);
- /* inode->i_nlink is left 1 - arguably wrong for directories,
- but it's the best we can do without reading the directory
- contents. 1 yields the right result in GNU find, even
- without -noleaf option. */
- if (S_ISREG(inode->i_mode)) {
- inode->i_fop = &generic_ro_fops;
- inode->i_data.a_ops = &cramfs_aops;
- } else if (S_ISDIR(inode->i_mode)) {
- inode->i_op = &cramfs_dir_inode_operations;
- inode->i_fop = &cramfs_directory_operations;
- } else if (S_ISLNK(inode->i_mode)) {
- inode->i_op = &page_symlink_inode_operations;
- inode->i_data.a_ops = &cramfs_aops;
- } else {
- inode->i_size = 0;
- inode->i_blocks = 0;
- init_special_inode(inode, inode->i_mode,
- old_decode_dev(cramfs_inode->size));
- }
return 0;
}
@@ -104,12 +59,48 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode),
cramfs_iget5_test, cramfs_iget5_set,
cramfs_inode);
+ static struct timespec zerotime;
+
if (inode && (inode->i_state & I_NEW)) {
+ inode->i_mode = cramfs_inode->mode;
+ inode->i_uid = cramfs_inode->uid;
+ inode->i_size = cramfs_inode->size;
+ inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1;
+ inode->i_gid = cramfs_inode->gid;
+ /* Struct copy intentional */
+ inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime;
+ /* inode->i_nlink is left 1 - arguably wrong for directories,
+ but it's the best we can do without reading the directory
+ contents. 1 yields the right result in GNU find, even
+ without -noleaf option. */
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_fop = &generic_ro_fops;
+ inode->i_data.a_ops = &cramfs_aops;
+ } else if (S_ISDIR(inode->i_mode)) {
+ inode->i_op = &cramfs_dir_inode_operations;
+ inode->i_fop = &cramfs_directory_operations;
+ } else if (S_ISLNK(inode->i_mode)) {
+ inode->i_op = &page_symlink_inode_operations;
+ inode->i_data.a_ops = &cramfs_aops;
+ } else {
+ inode->i_size = 0;
+ inode->i_blocks = 0;
+ init_special_inode(inode, inode->i_mode,
+ old_decode_dev(cramfs_inode->size));
+ }
unlock_new_inode(inode);
}
return inode;
}
+static void cramfs_drop_inode(struct inode *inode)
+{
+ if (inode->i_ino == 1)
+ generic_delete_inode(inode);
+ else
+ generic_drop_inode(inode);
+}
+
/*
* We have our own block cache: don't fill up the buffer cache
* with the rom-image, because the way the filesystem is set
@@ -534,6 +525,7 @@ static const struct super_operations cramfs_ops = {
.put_super = cramfs_put_super,
.remount_fs = cramfs_remount,
.statfs = cramfs_statfs,
+ .drop_inode = cramfs_drop_inode,
};
static int cramfs_get_sb(struct file_system_type *fs_type,
diff --git a/fs/dcache.c b/fs/dcache.c
index 3ee588d5f58..e7a1a99b746 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -17,6 +17,7 @@
#include <linux/syscalls.h>
#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
@@ -60,7 +61,6 @@ static struct kmem_cache *dentry_cache __read_mostly;
static unsigned int d_hash_mask __read_mostly;
static unsigned int d_hash_shift __read_mostly;
static struct hlist_head *dentry_hashtable __read_mostly;
-static LIST_HEAD(dentry_unused);
/* Statistics gathering. */
struct dentry_stat_t dentry_stat = {
@@ -95,20 +95,13 @@ static void d_free(struct dentry *dentry)
call_rcu(&dentry->d_u.d_rcu, d_callback);
}
-static void dentry_lru_remove(struct dentry *dentry)
-{
- if (!list_empty(&dentry->d_lru)) {
- list_del_init(&dentry->d_lru);
- dentry_stat.nr_unused--;
- }
-}
-
/*
* Release the dentry's inode, using the filesystem
* d_iput() operation if defined.
- * Called with dcache_lock and per dentry lock held, drops both.
*/
static void dentry_iput(struct dentry * dentry)
+ __releases(dentry->d_lock)
+ __releases(dcache_lock)
{
struct inode *inode = dentry->d_inode;
if (inode) {
@@ -128,16 +121,52 @@ static void dentry_iput(struct dentry * dentry)
}
}
+/*
+ * dentry_lru_(add|add_tail|del|del_init) must be called with dcache_lock held.
+ */
+static void dentry_lru_add(struct dentry *dentry)
+{
+ list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+ dentry->d_sb->s_nr_dentry_unused++;
+ dentry_stat.nr_unused++;
+}
+
+static void dentry_lru_add_tail(struct dentry *dentry)
+{
+ list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+ dentry->d_sb->s_nr_dentry_unused++;
+ dentry_stat.nr_unused++;
+}
+
+static void dentry_lru_del(struct dentry *dentry)
+{
+ if (!list_empty(&dentry->d_lru)) {
+ list_del(&dentry->d_lru);
+ dentry->d_sb->s_nr_dentry_unused--;
+ dentry_stat.nr_unused--;
+ }
+}
+
+static void dentry_lru_del_init(struct dentry *dentry)
+{
+ if (likely(!list_empty(&dentry->d_lru))) {
+ list_del_init(&dentry->d_lru);
+ dentry->d_sb->s_nr_dentry_unused--;
+ dentry_stat.nr_unused--;
+ }
+}
+
/**
* d_kill - kill dentry and return parent
* @dentry: dentry to kill
*
- * Called with dcache_lock and d_lock, releases both. The dentry must
- * already be unhashed and removed from the LRU.
+ * The dentry must already be unhashed and removed from the LRU.
*
* If this is the root of the dentry tree, return NULL.
*/
static struct dentry *d_kill(struct dentry *dentry)
+ __releases(dentry->d_lock)
+ __releases(dcache_lock)
{
struct dentry *parent;
@@ -209,8 +238,7 @@ repeat:
goto kill_it;
if (list_empty(&dentry->d_lru)) {
dentry->d_flags |= DCACHE_REFERENCED;
- list_add(&dentry->d_lru, &dentry_unused);
- dentry_stat.nr_unused++;
+ dentry_lru_add(dentry);
}
spin_unlock(&dentry->d_lock);
spin_unlock(&dcache_lock);
@@ -219,7 +247,8 @@ repeat:
unhash_it:
__d_drop(dentry);
kill_it:
- dentry_lru_remove(dentry);
+ /* if dentry was on the d_lru list delete it from there */
+ dentry_lru_del(dentry);
dentry = d_kill(dentry);
if (dentry)
goto repeat;
@@ -287,7 +316,7 @@ int d_invalidate(struct dentry * dentry)
static inline struct dentry * __dget_locked(struct dentry *dentry)
{
atomic_inc(&dentry->d_count);
- dentry_lru_remove(dentry);
+ dentry_lru_del_init(dentry);
return dentry;
}
@@ -383,11 +412,11 @@ restart:
* Try to prune ancestors as well. This is necessary to prevent
* quadratic behavior of shrink_dcache_parent(), but is also expected
* to be beneficial in reducing dentry cache fragmentation.
- *
- * Called with dcache_lock, drops it and then regains.
- * Called with dentry->d_lock held, drops it.
*/
static void prune_one_dentry(struct dentry * dentry)
+ __releases(dentry->d_lock)
+ __releases(dcache_lock)
+ __acquires(dcache_lock)
{
__d_drop(dentry);
dentry = d_kill(dentry);
@@ -403,133 +432,168 @@ static void prune_one_dentry(struct dentry * dentry)
if (dentry->d_op && dentry->d_op->d_delete)
dentry->d_op->d_delete(dentry);
- dentry_lru_remove(dentry);
+ dentry_lru_del_init(dentry);
__d_drop(dentry);
dentry = d_kill(dentry);
spin_lock(&dcache_lock);
}
}
-/**
- * prune_dcache - shrink the dcache
- * @count: number of entries to try and free
- * @sb: if given, ignore dentries for other superblocks
- * which are being unmounted.
- *
- * Shrink the dcache. This is done when we need
- * more memory, or simply when we need to unmount
- * something (at which point we need to unuse
- * all dentries).
- *
- * This function may fail to free any resources if
- * all the dentries are in use.
+/*
+ * Shrink the dentry LRU on a given superblock.
+ * @sb : superblock to shrink dentry LRU.
+ * @count: If count is NULL, we prune all dentries on superblock.
+ * @flags: If flags is non-zero, we need to do special processing based on
+ * which flags are set. This means we don't need to maintain multiple
+ * similar copies of this loop.
*/
-
-static void prune_dcache(int count, struct super_block *sb)
+static void __shrink_dcache_sb(struct super_block *sb, int *count, int flags)
{
- spin_lock(&dcache_lock);
- for (; count ; count--) {
- struct dentry *dentry;
- struct list_head *tmp;
- struct rw_semaphore *s_umount;
-
- cond_resched_lock(&dcache_lock);
+ LIST_HEAD(referenced);
+ LIST_HEAD(tmp);
+ struct dentry *dentry;
+ int cnt = 0;
- tmp = dentry_unused.prev;
- if (sb) {
- /* Try to find a dentry for this sb, but don't try
- * too hard, if they aren't near the tail they will
- * be moved down again soon
+ BUG_ON(!sb);
+ BUG_ON((flags & DCACHE_REFERENCED) && count == NULL);
+ spin_lock(&dcache_lock);
+ if (count != NULL)
+ /* called from prune_dcache() and shrink_dcache_parent() */
+ cnt = *count;
+restart:
+ if (count == NULL)
+ list_splice_init(&sb->s_dentry_lru, &tmp);
+ else {
+ while (!list_empty(&sb->s_dentry_lru)) {
+ dentry = list_entry(sb->s_dentry_lru.prev,
+ struct dentry, d_lru);
+ BUG_ON(dentry->d_sb != sb);
+
+ spin_lock(&dentry->d_lock);
+ /*
+ * If we are honouring the DCACHE_REFERENCED flag and
+ * the dentry has this flag set, don't free it. Clear
+ * the flag and put it back on the LRU.
*/
- int skip = count;
- while (skip && tmp != &dentry_unused &&
- list_entry(tmp, struct dentry, d_lru)->d_sb != sb) {
- skip--;
- tmp = tmp->prev;
+ if ((flags & DCACHE_REFERENCED)
+ && (dentry->d_flags & DCACHE_REFERENCED)) {
+ dentry->d_flags &= ~DCACHE_REFERENCED;
+ list_move_tail(&dentry->d_lru, &referenced);
+ spin_unlock(&dentry->d_lock);
+ } else {
+ list_move_tail(&dentry->d_lru, &tmp);
+ spin_unlock(&dentry->d_lock);
+ cnt--;
+ if (!cnt)
+ break;
}
+ cond_resched_lock(&dcache_lock);
}
- if (tmp == &dentry_unused)
- break;
- list_del_init(tmp);
- prefetch(dentry_unused.prev);
- dentry_stat.nr_unused--;
- dentry = list_entry(tmp, struct dentry, d_lru);
-
- spin_lock(&dentry->d_lock);
+ }
+ while (!list_empty(&tmp)) {
+ dentry = list_entry(tmp.prev, struct dentry, d_lru);
+ dentry_lru_del_init(dentry);
+ spin_lock(&dentry->d_lock);
/*
* We found an inuse dentry which was not removed from
- * dentry_unused because of laziness during lookup. Do not free
- * it - just keep it off the dentry_unused list.
+ * the LRU because of laziness during lookup. Do not free
+ * it - just keep it off the LRU list.
*/
- if (atomic_read(&dentry->d_count)) {
- spin_unlock(&dentry->d_lock);
+ if (atomic_read(&dentry->d_count)) {
+ spin_unlock(&dentry->d_lock);
continue;
}
- /* If the dentry was recently referenced, don't free it. */
- if (dentry->d_flags & DCACHE_REFERENCED) {
- dentry->d_flags &= ~DCACHE_REFERENCED;
- list_add(&dentry->d_lru, &dentry_unused);
- dentry_stat.nr_unused++;
- spin_unlock(&dentry->d_lock);
+ prune_one_dentry(dentry);
+ /* dentry->d_lock was dropped in prune_one_dentry() */
+ cond_resched_lock(&dcache_lock);
+ }
+ if (count == NULL && !list_empty(&sb->s_dentry_lru))
+ goto restart;
+ if (count != NULL)
+ *count = cnt;
+ if (!list_empty(&referenced))
+ list_splice(&referenced, &sb->s_dentry_lru);
+ spin_unlock(&dcache_lock);
+}
+
+/**
+ * prune_dcache - shrink the dcache
+ * @count: number of entries to try to free
+ *
+ * Shrink the dcache. This is done when we need more memory, or simply when we
+ * need to unmount something (at which point we need to unuse all dentries).
+ *
+ * This function may fail to free any resources if all the dentries are in use.
+ */
+static void prune_dcache(int count)
+{
+ struct super_block *sb;
+ int w_count;
+ int unused = dentry_stat.nr_unused;
+ int prune_ratio;
+ int pruned;
+
+ if (unused == 0 || count == 0)
+ return;
+ spin_lock(&dcache_lock);
+restart:
+ if (count >= unused)
+ prune_ratio = 1;
+ else
+ prune_ratio = unused / count;
+ spin_lock(&sb_lock);
+ list_for_each_entry(sb, &super_blocks, s_list) {
+ if (sb->s_nr_dentry_unused == 0)
continue;
- }
- /*
- * If the dentry is not DCACHED_REFERENCED, it is time
- * to remove it from the dcache, provided the super block is
- * NULL (which means we are trying to reclaim memory)
- * or this dentry belongs to the same super block that
- * we want to shrink.
+ sb->s_count++;
+ /* Now, we reclaim unused dentrins with fairness.
+ * We reclaim them same percentage from each superblock.
+ * We calculate number of dentries to scan on this sb
+ * as follows, but the implementation is arranged to avoid
+ * overflows:
+ * number of dentries to scan on this sb =
+ * count * (number of dentries on this sb /
+ * number of dentries in the machine)
*/
+ spin_unlock(&sb_lock);
+ if (prune_ratio != 1)
+ w_count = (sb->s_nr_dentry_unused / prune_ratio) + 1;
+ else
+ w_count = sb->s_nr_dentry_unused;
+ pruned = w_count;
/*
- * If this dentry is for "my" filesystem, then I can prune it
- * without taking the s_umount lock (I already hold it).
- */
- if (sb && dentry->d_sb == sb) {
- prune_one_dentry(dentry);
- continue;
- }
- /*
- * ...otherwise we need to be sure this filesystem isn't being
- * unmounted, otherwise we could race with
- * generic_shutdown_super(), and end up holding a reference to
- * an inode while the filesystem is unmounted.
- * So we try to get s_umount, and make sure s_root isn't NULL.
- * (Take a local copy of s_umount to avoid a use-after-free of
- * `dentry').
+ * We need to be sure this filesystem isn't being unmounted,
+ * otherwise we could race with generic_shutdown_super(), and
+ * end up holding a reference to an inode while the filesystem
+ * is unmounted. So we try to get s_umount, and make sure
+ * s_root isn't NULL.
*/
- s_umount = &dentry->d_sb->s_umount;
- if (down_read_trylock(s_umount)) {
- if (dentry->d_sb->s_root != NULL) {
- prune_one_dentry(dentry);
- up_read(s_umount);
- continue;
+ if (down_read_trylock(&sb->s_umount)) {
+ if ((sb->s_root != NULL) &&
+ (!list_empty(&sb->s_dentry_lru))) {
+ spin_unlock(&dcache_lock);
+ __shrink_dcache_sb(sb, &w_count,
+ DCACHE_REFERENCED);
+ pruned -= w_count;
+ spin_lock(&dcache_lock);
}
- up_read(s_umount);
+ up_read(&sb->s_umount);
}
- spin_unlock(&dentry->d_lock);
+ spin_lock(&sb_lock);
+ count -= pruned;
/*
- * Insert dentry at the head of the list as inserting at the
- * tail leads to a cycle.
+ * restart only when sb is no longer on the list and
+ * we have more work to do.
*/
- list_add(&dentry->d_lru, &dentry_unused);
- dentry_stat.nr_unused++;
+ if (__put_super_and_need_restart(sb) && count > 0) {
+ spin_unlock(&sb_lock);
+ goto restart;
+ }
}
+ spin_unlock(&sb_lock);
spin_unlock(&dcache_lock);
}
-/*
- * Shrink the dcache for the specified super block.
- * This allows us to unmount a device without disturbing
- * the dcache for the other devices.
- *
- * This implementation makes just two traversals of the
- * unused list. On the first pass we move the selected
- * dentries to the most recent end, and on the second
- * pass we free them. The second pass must restart after
- * each dput(), but since the target dentries are all at
- * the end, it's really just a single traversal.
- */
-
/**
* shrink_dcache_sb - shrink dcache for a superblock
* @sb: superblock
@@ -538,44 +602,9 @@ static void prune_dcache(int count, struct super_block *sb)
* is used to free the dcache before unmounting a file
* system
*/
-
void shrink_dcache_sb(struct super_block * sb)
{
- struct list_head *tmp, *next;
- struct dentry *dentry;
-
- /*
- * Pass one ... move the dentries for the specified
- * superblock to the most recent end of the unused list.
- */
- spin_lock(&dcache_lock);
- list_for_each_prev_safe(tmp, next, &dentry_unused) {
- dentry = list_entry(tmp, struct dentry, d_lru);
- if (dentry->d_sb != sb)
- continue;
- list_move_tail(tmp, &dentry_unused);
- }
-
- /*
- * Pass two ... free the dentries for this superblock.
- */
-repeat:
- list_for_each_prev_safe(tmp, next, &dentry_unused) {
- dentry = list_entry(tmp, struct dentry, d_lru);
- if (dentry->d_sb != sb)
- continue;
- dentry_stat.nr_unused--;
- list_del_init(tmp);
- spin_lock(&dentry->d_lock);
- if (atomic_read(&dentry->d_count)) {
- spin_unlock(&dentry->d_lock);
- continue;
- }
- prune_one_dentry(dentry);
- cond_resched_lock(&dcache_lock);
- goto repeat;
- }
- spin_unlock(&dcache_lock);
+ __shrink_dcache_sb(sb, NULL, 0);
}
/*
@@ -592,7 +621,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
/* detach this root from the system */
spin_lock(&dcache_lock);
- dentry_lru_remove(dentry);
+ dentry_lru_del_init(dentry);
__d_drop(dentry);
spin_unlock(&dcache_lock);
@@ -606,7 +635,7 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
spin_lock(&dcache_lock);
list_for_each_entry(loop, &dentry->d_subdirs,
d_u.d_child) {
- dentry_lru_remove(loop);
+ dentry_lru_del_init(loop);
__d_drop(loop);
cond_resched_lock(&dcache_lock);
}
@@ -788,14 +817,13 @@ resume:
struct dentry *dentry = list_entry(tmp, struct dentry, d_u.d_child);
next = tmp->next;
- dentry_lru_remove(dentry);
+ dentry_lru_del_init(dentry);
/*
* move only zero ref count dentries to the end
* of the unused list for prune_dcache
*/
if (!atomic_read(&dentry->d_count)) {
- list_add_tail(&dentry->d_lru, &dentry_unused);
- dentry_stat.nr_unused++;
+ dentry_lru_add_tail(dentry);
found++;
}
@@ -837,10 +865,11 @@ out:
void shrink_dcache_parent(struct dentry * parent)
{
+ struct super_block *sb = parent->d_sb;
int found;
while ((found = select_parent(parent)) != 0)
- prune_dcache(found, parent->d_sb);
+ __shrink_dcache_sb(sb, &found, 0);
}
/*
@@ -860,7 +889,7 @@ static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
if (nr) {
if (!(gfp_mask & __GFP_FS))
return -1;
- prune_dcache(nr, NULL);
+ prune_dcache(nr);
}
return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
}
@@ -1191,6 +1220,107 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
return new;
}
+/**
+ * d_add_ci - lookup or allocate new dentry with case-exact name
+ * @inode: the inode case-insensitive lookup has found
+ * @dentry: the negative dentry that was passed to the parent's lookup func
+ * @name: the case-exact name to be associated with the returned dentry
+ *
+ * This is to avoid filling the dcache with case-insensitive names to the
+ * same inode, only the actual correct case is stored in the dcache for
+ * case-insensitive filesystems.
+ *
+ * For a case-insensitive lookup match and if the the case-exact dentry
+ * already exists in in the dcache, use it and return it.
+ *
+ * If no entry exists with the exact case name, allocate new dentry with
+ * the exact case, and return the spliced entry.
+ */
+struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
+ struct qstr *name)
+{
+ int error;
+ struct dentry *found;
+ struct dentry *new;
+
+ /* Does a dentry matching the name exist already? */
+ found = d_hash_and_lookup(dentry->d_parent, name);
+ /* If not, create it now and return */
+ if (!found) {
+ new = d_alloc(dentry->d_parent, name);
+ if (!new) {
+ error = -ENOMEM;
+ goto err_out;
+ }
+ found = d_splice_alias(inode, new);
+ if (found) {
+ dput(new);
+ return found;
+ }
+ return new;
+ }
+ /* Matching dentry exists, check if it is negative. */
+ if (found->d_inode) {
+ if (unlikely(found->d_inode != inode)) {
+ /* This can't happen because bad inodes are unhashed. */
+ BUG_ON(!is_bad_inode(inode));
+ BUG_ON(!is_bad_inode(found->d_inode));
+ }
+ /*
+ * Already have the inode and the dentry attached, decrement
+ * the reference count to balance the iget() done
+ * earlier on. We found the dentry using d_lookup() so it
+ * cannot be disconnected and thus we do not need to worry
+ * about any NFS/disconnectedness issues here.
+ */
+ iput(inode);
+ return found;
+ }
+ /*
+ * Negative dentry: instantiate it unless the inode is a directory and
+ * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED),
+ * in which case d_move() that in place of the found dentry.
+ */
+ if (!S_ISDIR(inode->i_mode)) {
+ /* Not a directory; everything is easy. */
+ d_instantiate(found, inode);
+ return found;
+ }
+ spin_lock(&dcache_lock);
+ if (list_empty(&inode->i_dentry)) {
+ /*
+ * Directory without a 'disconnected' dentry; we need to do
+ * d_instantiate() by hand because it takes dcache_lock which
+ * we already hold.
+ */
+ list_add(&found->d_alias, &inode->i_dentry);
+ found->d_inode = inode;
+ spin_unlock(&dcache_lock);
+ security_d_instantiate(found, inode);
+ return found;
+ }
+ /*
+ * Directory with a 'disconnected' dentry; get a reference to the
+ * 'disconnected' dentry.
+ */
+ new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
+ dget_locked(new);
+ spin_unlock(&dcache_lock);
+ /* Do security vodoo. */
+ security_d_instantiate(found, inode);
+ /* Move new in place of found. */
+ d_move(new, found);
+ /* Balance the iget() we did above. */
+ iput(inode);
+ /* Throw away found. */
+ dput(found);
+ /* Use new as the actual dentry. */
+ return new;
+
+err_out:
+ iput(inode);
+ return ERR_PTR(error);
+}
/**
* d_lookup - search for a dentry
@@ -1212,7 +1342,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
* rcu_read_lock() and rcu_read_unlock() are used to disable preemption while
* lookup is going on.
*
- * dentry_unused list is not updated even if lookup finds the required dentry
+ * The dentry unused LRU is not updated even if lookup finds the required dentry
* in there. It is updated in places such as prune_dcache, shrink_dcache_sb,
* select_parent and __dget_locked. This laziness saves lookup from dcache_lock
* acquisition.
@@ -1265,6 +1395,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
if (dentry->d_parent != parent)
goto next;
+ /* non-existing due to RCU? */
+ if (d_unhashed(dentry))
+ goto next;
+
/*
* It is safe to compare names since d_move() cannot
* change the qstr (protected by d_lock).
@@ -1280,10 +1414,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
goto next;
}
- if (!d_unhashed(dentry)) {
- atomic_inc(&dentry->d_count);
- found = dentry;
- }
+ atomic_inc(&dentry->d_count);
+ found = dentry;
spin_unlock(&dentry->d_lock);
break;
next:
@@ -1604,10 +1736,9 @@ static int d_isparent(struct dentry *p1, struct dentry *p2)
*
* Note: If ever the locking in lock_rename() changes, then please
* remember to update this too...
- *
- * On return, dcache_lock will have been unlocked.
*/
static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
+ __releases(dcache_lock)
{
struct mutex *m1 = NULL, *m2 = NULL;
struct dentry *ret;
@@ -1743,11 +1874,9 @@ out_nolock:
shouldnt_be_hashed:
spin_unlock(&dcache_lock);
BUG();
- goto shouldnt_be_hashed;
}
-static int prepend(char **buffer, int *buflen, const char *str,
- int namelen)
+static int prepend(char **buffer, int *buflen, const char *str, int namelen)
{
*buflen -= namelen;
if (*buflen < 0)
@@ -1757,8 +1886,13 @@ static int prepend(char **buffer, int *buflen, const char *str,
return 0;
}
+static int prepend_name(char **buffer, int *buflen, struct qstr *name)
+{
+ return prepend(buffer, buflen, name->name, name->len);
+}
+
/**
- * d_path - return the path of a dentry
+ * __d_path - return the path of a dentry
* @path: the dentry/vfsmount to report
* @root: root vfsmnt/dentry (may be modified by this function)
* @buffer: buffer to return value in
@@ -1779,9 +1913,10 @@ char *__d_path(const struct path *path, struct path *root,
{
struct dentry *dentry = path->dentry;
struct vfsmount *vfsmnt = path->mnt;
- char * end = buffer+buflen;
- char * retval;
+ char *end = buffer + buflen;
+ char *retval;
+ spin_lock(&vfsmount_lock);
prepend(&end, &buflen, "\0", 1);
if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
(prepend(&end, &buflen, " (deleted)", 10) != 0))
@@ -1800,38 +1935,37 @@ char *__d_path(const struct path *path, struct path *root,
break;
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
/* Global root? */
- spin_lock(&vfsmount_lock);
if (vfsmnt->mnt_parent == vfsmnt) {
- spin_unlock(&vfsmount_lock);
goto global_root;
}
dentry = vfsmnt->mnt_mountpoint;
vfsmnt = vfsmnt->mnt_parent;
- spin_unlock(&vfsmount_lock);
continue;
}
parent = dentry->d_parent;
prefetch(parent);
- if ((prepend(&end, &buflen, dentry->d_name.name,
- dentry->d_name.len) != 0) ||
+ if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
(prepend(&end, &buflen, "/", 1) != 0))
goto Elong;
retval = end;
dentry = parent;
}
+out:
+ spin_unlock(&vfsmount_lock);
return retval;
global_root:
retval += 1; /* hit the slash */
- if (prepend(&retval, &buflen, dentry->d_name.name,
- dentry->d_name.len) != 0)
+ if (prepend_name(&retval, &buflen, &dentry->d_name) != 0)
goto Elong;
root->mnt = vfsmnt;
root->dentry = dentry;
- return retval;
+ goto out;
+
Elong:
- return ERR_PTR(-ENAMETOOLONG);
+ retval = ERR_PTR(-ENAMETOOLONG);
+ goto out;
}
/**
@@ -1845,9 +1979,9 @@ Elong:
*
* Returns the buffer or an error code if the path was too long.
*
- * "buflen" should be positive. Caller holds the dcache_lock.
+ * "buflen" should be positive.
*/
-char *d_path(struct path *path, char *buf, int buflen)
+char *d_path(const struct path *path, char *buf, int buflen)
{
char *res;
struct path root;
@@ -1915,16 +2049,11 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
retval = end-1;
*retval = '/';
- for (;;) {
- struct dentry *parent;
- if (IS_ROOT(dentry))
- break;
+ while (!IS_ROOT(dentry)) {
+ struct dentry *parent = dentry->d_parent;
- parent = dentry->d_parent;
prefetch(parent);
-
- if ((prepend(&end, &buflen, dentry->d_name.name,
- dentry->d_name.len) != 0) ||
+ if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
(prepend(&end, &buflen, "/", 1) != 0))
goto Elong;
@@ -1975,7 +2104,7 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
error = -ENOENT;
/* Has the current directory has been unlinked? */
spin_lock(&dcache_lock);
- if (pwd.dentry->d_parent == pwd.dentry || !d_unhashed(pwd.dentry)) {
+ if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) {
unsigned long len;
struct path tmp = root;
char * cwd;
@@ -2228,6 +2357,7 @@ EXPORT_SYMBOL(d_path);
EXPORT_SYMBOL(d_prune_aliases);
EXPORT_SYMBOL(d_rehash);
EXPORT_SYMBOL(d_splice_alias);
+EXPORT_SYMBOL(d_add_ci);
EXPORT_SYMBOL(d_validate);
EXPORT_SYMBOL(dget_locked);
EXPORT_SYMBOL(dput);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e9602d85c11..3dbe2169cf3 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -26,8 +26,7 @@
#include <linux/debugfs.h>
#include <linux/fsnotify.h>
#include <linux/string.h>
-
-#define DEBUGFS_MAGIC 0x64626720
+#include <linux/magic.h>
static struct vfsmount *debugfs_mount;
static int debugfs_mount_count;
@@ -309,6 +308,31 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
}
EXPORT_SYMBOL_GPL(debugfs_create_symlink);
+static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
+{
+ int ret = 0;
+
+ if (debugfs_positive(dentry)) {
+ if (dentry->d_inode) {
+ dget(dentry);
+ switch (dentry->d_inode->i_mode & S_IFMT) {
+ case S_IFDIR:
+ ret = simple_rmdir(parent->d_inode, dentry);
+ break;
+ case S_IFLNK:
+ kfree(dentry->d_inode->i_private);
+ /* fall through */
+ default:
+ simple_unlink(parent->d_inode, dentry);
+ break;
+ }
+ if (!ret)
+ d_delete(dentry);
+ dput(dentry);
+ }
+ }
+}
+
/**
* debugfs_remove - removes a file or directory from the debugfs filesystem
* @dentry: a pointer to a the dentry of the file or directory to be
@@ -325,7 +349,6 @@ EXPORT_SYMBOL_GPL(debugfs_create_symlink);
void debugfs_remove(struct dentry *dentry)
{
struct dentry *parent;
- int ret = 0;
if (!dentry)
return;
@@ -335,29 +358,83 @@ void debugfs_remove(struct dentry *dentry)
return;
mutex_lock(&parent->d_inode->i_mutex);
- if (debugfs_positive(dentry)) {
- if (dentry->d_inode) {
- dget(dentry);
- switch (dentry->d_inode->i_mode & S_IFMT) {
- case S_IFDIR:
- ret = simple_rmdir(parent->d_inode, dentry);
- break;
- case S_IFLNK:
- kfree(dentry->d_inode->i_private);
- /* fall through */
- default:
- simple_unlink(parent->d_inode, dentry);
+ __debugfs_remove(dentry, parent);
+ mutex_unlock(&parent->d_inode->i_mutex);
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+}
+EXPORT_SYMBOL_GPL(debugfs_remove);
+
+/**
+ * debugfs_remove_recursive - recursively removes a directory
+ * @dentry: a pointer to a the dentry of the directory to be removed.
+ *
+ * This function recursively removes a directory tree in debugfs that
+ * was previously created with a call to another debugfs function
+ * (like debugfs_create_file() or variants thereof.)
+ *
+ * This function is required to be called in order for the file to be
+ * removed, no automatic cleanup of files will happen when a module is
+ * removed, you are responsible here.
+ */
+void debugfs_remove_recursive(struct dentry *dentry)
+{
+ struct dentry *child;
+ struct dentry *parent;
+
+ if (!dentry)
+ return;
+
+ parent = dentry->d_parent;
+ if (!parent || !parent->d_inode)
+ return;
+
+ parent = dentry;
+ mutex_lock(&parent->d_inode->i_mutex);
+
+ while (1) {
+ /*
+ * When all dentries under "parent" has been removed,
+ * walk up the tree until we reach our starting point.
+ */
+ if (list_empty(&parent->d_subdirs)) {
+ mutex_unlock(&parent->d_inode->i_mutex);
+ if (parent == dentry)
break;
- }
- if (!ret)
- d_delete(dentry);
- dput(dentry);
+ parent = parent->d_parent;
+ mutex_lock(&parent->d_inode->i_mutex);
+ }
+ child = list_entry(parent->d_subdirs.next, struct dentry,
+ d_u.d_child);
+
+ /*
+ * If "child" isn't empty, walk down the tree and
+ * remove all its descendants first.
+ */
+ if (!list_empty(&child->d_subdirs)) {
+ mutex_unlock(&parent->d_inode->i_mutex);
+ parent = child;
+ mutex_lock(&parent->d_inode->i_mutex);
+ continue;
}
+ __debugfs_remove(child, parent);
+ if (parent->d_subdirs.next == &child->d_u.d_child) {
+ /*
+ * Avoid infinite loop if we fail to remove
+ * one dentry.
+ */
+ mutex_unlock(&parent->d_inode->i_mutex);
+ break;
+ }
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
+
+ parent = dentry->d_parent;
+ mutex_lock(&parent->d_inode->i_mutex);
+ __debugfs_remove(dentry, parent);
mutex_unlock(&parent->d_inode->i_mutex);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
-EXPORT_SYMBOL_GPL(debugfs_remove);
+EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
/**
* debugfs_rename - rename a file/directory in the debugfs filesystem
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 285b64a8b06..4a714f6c1be 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -27,9 +27,10 @@
#define DEVPTS_SUPER_MAGIC 0x1cd1
#define DEVPTS_DEFAULT_MODE 0600
+#define PTMX_MINOR 2
extern int pty_limit; /* Config limit on Unix98 ptys */
-static DEFINE_IDR(allocated_ptys);
+static DEFINE_IDA(allocated_ptys);
static DEFINE_MUTEX(allocated_ptys_lock);
static struct vfsmount *devpts_mnt;
@@ -48,7 +49,7 @@ enum {
Opt_err
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
{Opt_mode, "mode=%o"},
@@ -169,35 +170,27 @@ static struct file_system_type devpts_fs_type = {
* to the System V naming convention
*/
-static struct dentry *get_node(int num)
-{
- char s[12];
- struct dentry *root = devpts_root;
- mutex_lock(&root->d_inode->i_mutex);
- return lookup_one_len(s, root, sprintf(s, "%d", num));
-}
-
-int devpts_new_index(void)
+int devpts_new_index(struct inode *ptmx_inode)
{
int index;
- int idr_ret;
+ int ida_ret;
retry:
- if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) {
+ if (!ida_pre_get(&allocated_ptys, GFP_KERNEL)) {
return -ENOMEM;
}
mutex_lock(&allocated_ptys_lock);
- idr_ret = idr_get_new(&allocated_ptys, NULL, &index);
- if (idr_ret < 0) {
+ ida_ret = ida_get_new(&allocated_ptys, &index);
+ if (ida_ret < 0) {
mutex_unlock(&allocated_ptys_lock);
- if (idr_ret == -EAGAIN)
+ if (ida_ret == -EAGAIN)
goto retry;
return -EIO;
}
if (index >= pty_limit) {
- idr_remove(&allocated_ptys, index);
+ ida_remove(&allocated_ptys, index);
mutex_unlock(&allocated_ptys_lock);
return -EIO;
}
@@ -205,20 +198,21 @@ retry:
return index;
}
-void devpts_kill_index(int idx)
+void devpts_kill_index(struct inode *ptmx_inode, int idx)
{
mutex_lock(&allocated_ptys_lock);
- idr_remove(&allocated_ptys, idx);
+ ida_remove(&allocated_ptys, idx);
mutex_unlock(&allocated_ptys_lock);
}
-int devpts_pty_new(struct tty_struct *tty)
+int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
{
int number = tty->index; /* tty layer puts index from devpts_new_index() in here */
struct tty_driver *driver = tty->driver;
dev_t device = MKDEV(driver->major, driver->minor_start+number);
struct dentry *dentry;
struct inode *inode = new_inode(devpts_mnt->mnt_sb);
+ char s[12];
/* We're supposed to be given the slave end of a pty */
BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY);
@@ -233,10 +227,15 @@ int devpts_pty_new(struct tty_struct *tty)
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
init_special_inode(inode, S_IFCHR|config.mode, device);
inode->i_private = tty;
+ tty->driver_data = inode;
- dentry = get_node(number);
- if (!IS_ERR(dentry) && !dentry->d_inode) {
- d_instantiate(dentry, inode);
+ sprintf(s, "%d", number);
+
+ mutex_lock(&devpts_root->d_inode->i_mutex);
+
+ dentry = d_alloc_name(devpts_root, s);
+ if (!IS_ERR(dentry)) {
+ d_add(dentry, inode);
fsnotify_create(devpts_root->d_inode, dentry);
}
@@ -245,36 +244,31 @@ int devpts_pty_new(struct tty_struct *tty)
return 0;
}
-struct tty_struct *devpts_get_tty(int number)
+struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
{
- struct dentry *dentry = get_node(number);
- struct tty_struct *tty;
-
- tty = NULL;
- if (!IS_ERR(dentry)) {
- if (dentry->d_inode)
- tty = dentry->d_inode->i_private;
- dput(dentry);
- }
+ BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
- mutex_unlock(&devpts_root->d_inode->i_mutex);
-
- return tty;
+ if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
+ return (struct tty_struct *)pts_inode->i_private;
+ return NULL;
}
-void devpts_pty_kill(int number)
+void devpts_pty_kill(struct tty_struct *tty)
{
- struct dentry *dentry = get_node(number);
+ struct inode *inode = tty->driver_data;
+ struct dentry *dentry;
- if (!IS_ERR(dentry)) {
- struct inode *inode = dentry->d_inode;
- if (inode) {
- inode->i_nlink--;
- d_delete(dentry);
- dput(dentry);
- }
+ BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
+
+ mutex_lock(&devpts_root->d_inode->i_mutex);
+
+ dentry = d_find_alias(inode);
+ if (dentry && !IS_ERR(dentry)) {
+ inode->i_nlink--;
+ d_delete(dentry);
dput(dentry);
}
+
mutex_unlock(&devpts_root->d_inode->i_mutex);
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 9e81addbd6e..9606ee848fd 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -150,17 +150,11 @@ static int dio_refill_pages(struct dio *dio)
int nr_pages;
nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
- down_read(&current->mm->mmap_sem);
- ret = get_user_pages(
- current, /* Task for fault acounting */
- current->mm, /* whose pages? */
+ ret = get_user_pages_fast(
dio->curr_user_address, /* Where from? */
nr_pages, /* How many pages? */
dio->rw == READ, /* Write to memory? */
- 0, /* force (?) */
- &dio->pages[0],
- NULL); /* vmas */
- up_read(&current->mm->mmap_sem);
+ &dio->pages[0]); /* Put results here */
if (ret < 0 && dio->blocks_available && (dio->rw & WRITE)) {
struct page *page = ZERO_PAGE(0);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index eac23bd288b..fd9859f92fa 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,9 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/configfs.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
#include <net/sock.h>
#include "config.h"
@@ -30,16 +33,16 @@
static struct config_group *space_list;
static struct config_group *comm_list;
-static struct comm *local_comm;
+static struct dlm_comm *local_comm;
-struct clusters;
-struct cluster;
-struct spaces;
-struct space;
-struct comms;
-struct comm;
-struct nodes;
-struct node;
+struct dlm_clusters;
+struct dlm_cluster;
+struct dlm_spaces;
+struct dlm_space;
+struct dlm_comms;
+struct dlm_comm;
+struct dlm_nodes;
+struct dlm_node;
static struct config_group *make_cluster(struct config_group *, const char *);
static void drop_cluster(struct config_group *, struct config_item *);
@@ -68,17 +71,22 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
const char *buf, size_t len);
-static ssize_t comm_nodeid_read(struct comm *cm, char *buf);
-static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len);
-static ssize_t comm_local_read(struct comm *cm, char *buf);
-static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len);
-static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len);
-static ssize_t node_nodeid_read(struct node *nd, char *buf);
-static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len);
-static ssize_t node_weight_read(struct node *nd, char *buf);
-static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
-
-struct cluster {
+static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf);
+static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
+ size_t len);
+static ssize_t comm_local_read(struct dlm_comm *cm, char *buf);
+static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
+ size_t len);
+static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf,
+ size_t len);
+static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf);
+static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
+ size_t len);
+static ssize_t node_weight_read(struct dlm_node *nd, char *buf);
+static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
+ size_t len);
+
+struct dlm_cluster {
struct config_group group;
unsigned int cl_tcp_port;
unsigned int cl_buffer_size;
@@ -109,11 +117,11 @@ enum {
struct cluster_attribute {
struct configfs_attribute attr;
- ssize_t (*show)(struct cluster *, char *);
- ssize_t (*store)(struct cluster *, const char *, size_t);
+ ssize_t (*show)(struct dlm_cluster *, char *);
+ ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
};
-static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
+static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
int *info_field, int check_zero,
const char *buf, size_t len)
{
@@ -134,12 +142,12 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
}
#define CLUSTER_ATTR(name, check_zero) \
-static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \
+static ssize_t name##_write(struct dlm_cluster *cl, const char *buf, size_t len) \
{ \
return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \
check_zero, buf, len); \
} \
-static ssize_t name##_read(struct cluster *cl, char *buf) \
+static ssize_t name##_read(struct dlm_cluster *cl, char *buf) \
{ \
return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name); \
} \
@@ -181,8 +189,8 @@ enum {
struct comm_attribute {
struct configfs_attribute attr;
- ssize_t (*show)(struct comm *, char *);
- ssize_t (*store)(struct comm *, const char *, size_t);
+ ssize_t (*show)(struct dlm_comm *, char *);
+ ssize_t (*store)(struct dlm_comm *, const char *, size_t);
};
static struct comm_attribute comm_attr_nodeid = {
@@ -222,8 +230,8 @@ enum {
struct node_attribute {
struct configfs_attribute attr;
- ssize_t (*show)(struct node *, char *);
- ssize_t (*store)(struct node *, const char *, size_t);
+ ssize_t (*show)(struct dlm_node *, char *);
+ ssize_t (*store)(struct dlm_node *, const char *, size_t);
};
static struct node_attribute node_attr_nodeid = {
@@ -248,26 +256,26 @@ static struct configfs_attribute *node_attrs[] = {
NULL,
};
-struct clusters {
+struct dlm_clusters {
struct configfs_subsystem subsys;
};
-struct spaces {
+struct dlm_spaces {
struct config_group ss_group;
};
-struct space {
+struct dlm_space {
struct config_group group;
struct list_head members;
struct mutex members_lock;
int members_count;
};
-struct comms {
+struct dlm_comms {
struct config_group cs_group;
};
-struct comm {
+struct dlm_comm {
struct config_item item;
int nodeid;
int local;
@@ -275,11 +283,11 @@ struct comm {
struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
};
-struct nodes {
+struct dlm_nodes {
struct config_group ns_group;
};
-struct node {
+struct dlm_node {
struct config_item item;
struct list_head list; /* space->members */
int nodeid;
@@ -372,38 +380,40 @@ static struct config_item_type node_type = {
.ct_owner = THIS_MODULE,
};
-static struct cluster *to_cluster(struct config_item *i)
+static struct dlm_cluster *config_item_to_cluster(struct config_item *i)
{
- return i ? container_of(to_config_group(i), struct cluster, group):NULL;
+ return i ? container_of(to_config_group(i), struct dlm_cluster, group) :
+ NULL;
}
-static struct space *to_space(struct config_item *i)
+static struct dlm_space *config_item_to_space(struct config_item *i)
{
- return i ? container_of(to_config_group(i), struct space, group) : NULL;
+ return i ? container_of(to_config_group(i), struct dlm_space, group) :
+ NULL;
}
-static struct comm *to_comm(struct config_item *i)
+static struct dlm_comm *config_item_to_comm(struct config_item *i)
{
- return i ? container_of(i, struct comm, item) : NULL;
+ return i ? container_of(i, struct dlm_comm, item) : NULL;
}
-static struct node *to_node(struct config_item *i)
+static struct dlm_node *config_item_to_node(struct config_item *i)
{
- return i ? container_of(i, struct node, item) : NULL;
+ return i ? container_of(i, struct dlm_node, item) : NULL;
}
static struct config_group *make_cluster(struct config_group *g,
const char *name)
{
- struct cluster *cl = NULL;
- struct spaces *sps = NULL;
- struct comms *cms = NULL;
+ struct dlm_cluster *cl = NULL;
+ struct dlm_spaces *sps = NULL;
+ struct dlm_comms *cms = NULL;
void *gps = NULL;
- cl = kzalloc(sizeof(struct cluster), GFP_KERNEL);
+ cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL);
gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL);
- sps = kzalloc(sizeof(struct spaces), GFP_KERNEL);
- cms = kzalloc(sizeof(struct comms), GFP_KERNEL);
+ sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL);
+ cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL);
if (!cl || !gps || !sps || !cms)
goto fail;
@@ -438,12 +448,12 @@ static struct config_group *make_cluster(struct config_group *g,
kfree(gps);
kfree(sps);
kfree(cms);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
static void drop_cluster(struct config_group *g, struct config_item *i)
{
- struct cluster *cl = to_cluster(i);
+ struct dlm_cluster *cl = config_item_to_cluster(i);
struct config_item *tmp;
int j;
@@ -461,20 +471,20 @@ static void drop_cluster(struct config_group *g, struct config_item *i)
static void release_cluster(struct config_item *i)
{
- struct cluster *cl = to_cluster(i);
+ struct dlm_cluster *cl = config_item_to_cluster(i);
kfree(cl->group.default_groups);
kfree(cl);
}
static struct config_group *make_space(struct config_group *g, const char *name)
{
- struct space *sp = NULL;
- struct nodes *nds = NULL;
+ struct dlm_space *sp = NULL;
+ struct dlm_nodes *nds = NULL;
void *gps = NULL;
- sp = kzalloc(sizeof(struct space), GFP_KERNEL);
+ sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL);
gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL);
- nds = kzalloc(sizeof(struct nodes), GFP_KERNEL);
+ nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL);
if (!sp || !gps || !nds)
goto fail;
@@ -495,12 +505,12 @@ static struct config_group *make_space(struct config_group *g, const char *name)
kfree(sp);
kfree(gps);
kfree(nds);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
static void drop_space(struct config_group *g, struct config_item *i)
{
- struct space *sp = to_space(i);
+ struct dlm_space *sp = config_item_to_space(i);
struct config_item *tmp;
int j;
@@ -517,18 +527,18 @@ static void drop_space(struct config_group *g, struct config_item *i)
static void release_space(struct config_item *i)
{
- struct space *sp = to_space(i);
+ struct dlm_space *sp = config_item_to_space(i);
kfree(sp->group.default_groups);
kfree(sp);
}
static struct config_item *make_comm(struct config_group *g, const char *name)
{
- struct comm *cm;
+ struct dlm_comm *cm;
- cm = kzalloc(sizeof(struct comm), GFP_KERNEL);
+ cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL);
if (!cm)
- return NULL;
+ return ERR_PTR(-ENOMEM);
config_item_init_type_name(&cm->item, name, &comm_type);
cm->nodeid = -1;
@@ -539,7 +549,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
static void drop_comm(struct config_group *g, struct config_item *i)
{
- struct comm *cm = to_comm(i);
+ struct dlm_comm *cm = config_item_to_comm(i);
if (local_comm == cm)
local_comm = NULL;
dlm_lowcomms_close(cm->nodeid);
@@ -550,18 +560,18 @@ static void drop_comm(struct config_group *g, struct config_item *i)
static void release_comm(struct config_item *i)
{
- struct comm *cm = to_comm(i);
+ struct dlm_comm *cm = config_item_to_comm(i);
kfree(cm);
}
static struct config_item *make_node(struct config_group *g, const char *name)
{
- struct space *sp = to_space(g->cg_item.ci_parent);
- struct node *nd;
+ struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+ struct dlm_node *nd;
- nd = kzalloc(sizeof(struct node), GFP_KERNEL);
+ nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL);
if (!nd)
- return NULL;
+ return ERR_PTR(-ENOMEM);
config_item_init_type_name(&nd->item, name, &node_type);
nd->nodeid = -1;
@@ -578,8 +588,8 @@ static struct config_item *make_node(struct config_group *g, const char *name)
static void drop_node(struct config_group *g, struct config_item *i)
{
- struct space *sp = to_space(g->cg_item.ci_parent);
- struct node *nd = to_node(i);
+ struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
+ struct dlm_node *nd = config_item_to_node(i);
mutex_lock(&sp->members_lock);
list_del(&nd->list);
@@ -591,11 +601,11 @@ static void drop_node(struct config_group *g, struct config_item *i)
static void release_node(struct config_item *i)
{
- struct node *nd = to_node(i);
+ struct dlm_node *nd = config_item_to_node(i);
kfree(nd);
}
-static struct clusters clusters_root = {
+static struct dlm_clusters clusters_root = {
.subsys = {
.su_group = {
.cg_item = {
@@ -625,7 +635,7 @@ void dlm_config_exit(void)
static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
char *buf)
{
- struct cluster *cl = to_cluster(i);
+ struct dlm_cluster *cl = config_item_to_cluster(i);
struct cluster_attribute *cla =
container_of(a, struct cluster_attribute, attr);
return cla->show ? cla->show(cl, buf) : 0;
@@ -635,7 +645,7 @@ static ssize_t store_cluster(struct config_item *i,
struct configfs_attribute *a,
const char *buf, size_t len)
{
- struct cluster *cl = to_cluster(i);
+ struct dlm_cluster *cl = config_item_to_cluster(i);
struct cluster_attribute *cla =
container_of(a, struct cluster_attribute, attr);
return cla->store ? cla->store(cl, buf, len) : -EINVAL;
@@ -644,7 +654,7 @@ static ssize_t store_cluster(struct config_item *i,
static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
char *buf)
{
- struct comm *cm = to_comm(i);
+ struct dlm_comm *cm = config_item_to_comm(i);
struct comm_attribute *cma =
container_of(a, struct comm_attribute, attr);
return cma->show ? cma->show(cm, buf) : 0;
@@ -653,29 +663,31 @@ static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
const char *buf, size_t len)
{
- struct comm *cm = to_comm(i);
+ struct dlm_comm *cm = config_item_to_comm(i);
struct comm_attribute *cma =
container_of(a, struct comm_attribute, attr);
return cma->store ? cma->store(cm, buf, len) : -EINVAL;
}
-static ssize_t comm_nodeid_read(struct comm *cm, char *buf)
+static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf)
{
return sprintf(buf, "%d\n", cm->nodeid);
}
-static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
+ size_t len)
{
cm->nodeid = simple_strtol(buf, NULL, 0);
return len;
}
-static ssize_t comm_local_read(struct comm *cm, char *buf)
+static ssize_t comm_local_read(struct dlm_comm *cm, char *buf)
{
return sprintf(buf, "%d\n", cm->local);
}
-static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
+ size_t len)
{
cm->local= simple_strtol(buf, NULL, 0);
if (cm->local && !local_comm)
@@ -683,7 +695,7 @@ static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len)
return len;
}
-static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
+static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
{
struct sockaddr_storage *addr;
@@ -705,7 +717,7 @@ static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len)
static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
char *buf)
{
- struct node *nd = to_node(i);
+ struct dlm_node *nd = config_item_to_node(i);
struct node_attribute *nda =
container_of(a, struct node_attribute, attr);
return nda->show ? nda->show(nd, buf) : 0;
@@ -714,29 +726,31 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a,
static ssize_t store_node(struct config_item *i, struct configfs_attribute *a,
const char *buf, size_t len)
{
- struct node *nd = to_node(i);
+ struct dlm_node *nd = config_item_to_node(i);
struct node_attribute *nda =
container_of(a, struct node_attribute, attr);
return nda->store ? nda->store(nd, buf, len) : -EINVAL;
}
-static ssize_t node_nodeid_read(struct node *nd, char *buf)
+static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
{
return sprintf(buf, "%d\n", nd->nodeid);
}
-static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len)
+static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
+ size_t len)
{
nd->nodeid = simple_strtol(buf, NULL, 0);
return len;
}
-static ssize_t node_weight_read(struct node *nd, char *buf)
+static ssize_t node_weight_read(struct dlm_node *nd, char *buf)
{
return sprintf(buf, "%d\n", nd->weight);
}
-static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
+static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
+ size_t len)
{
nd->weight = simple_strtol(buf, NULL, 0);
return len;
@@ -746,7 +760,7 @@ static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len)
* Functions for the dlm to get the info that's been configured
*/
-static struct space *get_space(char *name)
+static struct dlm_space *get_space(char *name)
{
struct config_item *i;
@@ -757,18 +771,45 @@ static struct space *get_space(char *name)
i = config_group_find_item(space_list, name);
mutex_unlock(&space_list->cg_subsys->su_mutex);
- return to_space(i);
+ return config_item_to_space(i);
}
-static void put_space(struct space *sp)
+static void put_space(struct dlm_space *sp)
{
config_item_put(&sp->group.cg_item);
}
-static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
+static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y)
+{
+ switch (x->ss_family) {
+ case AF_INET: {
+ struct sockaddr_in *sinx = (struct sockaddr_in *)x;
+ struct sockaddr_in *siny = (struct sockaddr_in *)y;
+ if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr)
+ return 0;
+ if (sinx->sin_port != siny->sin_port)
+ return 0;
+ break;
+ }
+ case AF_INET6: {
+ struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x;
+ struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y;
+ if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr))
+ return 0;
+ if (sinx->sin6_port != siny->sin6_port)
+ return 0;
+ break;
+ }
+ default:
+ return 0;
+ }
+ return 1;
+}
+
+static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr)
{
struct config_item *i;
- struct comm *cm = NULL;
+ struct dlm_comm *cm = NULL;
int found = 0;
if (!comm_list)
@@ -777,7 +818,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
mutex_lock(&clusters_root.subsys.su_mutex);
list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
- cm = to_comm(i);
+ cm = config_item_to_comm(i);
if (nodeid) {
if (cm->nodeid != nodeid)
@@ -786,8 +827,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
config_item_get(i);
break;
} else {
- if (!cm->addr_count ||
- memcmp(cm->addr[0], addr, sizeof(*addr)))
+ if (!cm->addr_count || !addr_compare(cm->addr[0], addr))
continue;
found = 1;
config_item_get(i);
@@ -801,7 +841,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
return cm;
}
-static void put_comm(struct comm *cm)
+static void put_comm(struct dlm_comm *cm)
{
config_item_put(&cm->item);
}
@@ -810,8 +850,8 @@ static void put_comm(struct comm *cm)
int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
int **new_out, int *new_count_out)
{
- struct space *sp;
- struct node *nd;
+ struct dlm_space *sp;
+ struct dlm_node *nd;
int i = 0, rv = 0, ids_count = 0, new_count = 0;
int *ids, *new;
@@ -874,8 +914,8 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
int dlm_node_weight(char *lsname, int nodeid)
{
- struct space *sp;
- struct node *nd;
+ struct dlm_space *sp;
+ struct dlm_node *nd;
int w = -EEXIST;
sp = get_space(lsname);
@@ -897,7 +937,7 @@ int dlm_node_weight(char *lsname, int nodeid)
int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
{
- struct comm *cm = get_comm(nodeid, NULL);
+ struct dlm_comm *cm = get_comm(nodeid, NULL);
if (!cm)
return -EEXIST;
if (!cm->addr_count)
@@ -909,7 +949,7 @@ int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid)
{
- struct comm *cm = get_comm(0, addr);
+ struct dlm_comm *cm = get_comm(0, addr);
if (!cm)
return -EEXIST;
*nodeid = cm->nodeid;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 5a7ac33b629..868e4c9ef12 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -441,8 +441,11 @@ struct dlm_ls {
uint32_t ls_global_id; /* global unique lockspace ID */
uint32_t ls_exflags;
int ls_lvblen;
- int ls_count; /* reference count */
+ int ls_count; /* refcount of processes in
+ the dlm using this ls */
+ int ls_create_count; /* create/release refcount */
unsigned long ls_flags; /* LSFL_ */
+ unsigned long ls_scan_time;
struct kobject ls_kobj;
struct dlm_rsbtable *ls_rsbtbl;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 2d3d1027ce2..724ddac9153 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -363,6 +363,7 @@ static int search_rsb_list(struct list_head *head, char *name, int len,
if (len == r->res_length && !memcmp(name, r->res_name, len))
goto found;
}
+ *r_ret = NULL;
return -EBADR;
found:
@@ -1782,7 +1783,8 @@ static void grant_pending_locks(struct dlm_rsb *r)
list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
if (lkb->lkb_bastfn && lock_requires_bast(lkb, high, cw)) {
- if (cw && high == DLM_LOCK_PR)
+ if (cw && high == DLM_LOCK_PR &&
+ lkb->lkb_grmode == DLM_LOCK_PR)
queue_bast(r, lkb, DLM_LOCK_CW);
else
queue_bast(r, lkb, high);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 499e16759e9..d910501de6d 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
*******************************************************************************
**
** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
-** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
**
** This copyrighted material is made available to anyone wishing to use,
** modify, copy, or redistribute it subject to the terms and conditions
@@ -23,6 +23,7 @@
#include "lock.h"
#include "recover.h"
#include "requestqueue.h"
+#include "user.h"
static int ls_count;
static struct mutex ls_lock;
@@ -211,19 +212,41 @@ void dlm_lockspace_exit(void)
kset_unregister(dlm_kset);
}
+static struct dlm_ls *find_ls_to_scan(void)
+{
+ struct dlm_ls *ls;
+
+ spin_lock(&lslist_lock);
+ list_for_each_entry(ls, &lslist, ls_list) {
+ if (time_after_eq(jiffies, ls->ls_scan_time +
+ dlm_config.ci_scan_secs * HZ)) {
+ spin_unlock(&lslist_lock);
+ return ls;
+ }
+ }
+ spin_unlock(&lslist_lock);
+ return NULL;
+}
+
static int dlm_scand(void *data)
{
struct dlm_ls *ls;
+ int timeout_jiffies = dlm_config.ci_scan_secs * HZ;
while (!kthread_should_stop()) {
- list_for_each_entry(ls, &lslist, ls_list) {
+ ls = find_ls_to_scan();
+ if (ls) {
if (dlm_lock_recovery_try(ls)) {
+ ls->ls_scan_time = jiffies;
dlm_scan_rsbs(ls);
dlm_scan_timeout(ls);
dlm_unlock_recovery(ls);
+ } else {
+ ls->ls_scan_time += HZ;
}
+ } else {
+ schedule_timeout_interruptible(timeout_jiffies);
}
- schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
}
return 0;
}
@@ -246,23 +269,6 @@ static void dlm_scand_stop(void)
kthread_stop(scand_task);
}
-static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen)
-{
- struct dlm_ls *ls;
-
- spin_lock(&lslist_lock);
-
- list_for_each_entry(ls, &lslist, ls_list) {
- if (ls->ls_namelen == namelen &&
- memcmp(ls->ls_name, name, namelen) == 0)
- goto out;
- }
- ls = NULL;
- out:
- spin_unlock(&lslist_lock);
- return ls;
-}
-
struct dlm_ls *dlm_find_lockspace_global(uint32_t id)
{
struct dlm_ls *ls;
@@ -327,6 +333,7 @@ static void remove_lockspace(struct dlm_ls *ls)
for (;;) {
spin_lock(&lslist_lock);
if (ls->ls_count == 0) {
+ WARN_ON(ls->ls_create_count != 0);
list_del(&ls->ls_list);
spin_unlock(&lslist_lock);
return;
@@ -381,7 +388,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
uint32_t flags, int lvblen)
{
struct dlm_ls *ls;
- int i, size, error = -ENOMEM;
+ int i, size, error;
int do_unreg = 0;
if (namelen > DLM_LOCKSPACE_LEN)
@@ -393,12 +400,37 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
if (!try_module_get(THIS_MODULE))
return -EINVAL;
- ls = dlm_find_lockspace_name(name, namelen);
- if (ls) {
- *lockspace = ls;
+ if (!dlm_user_daemon_available()) {
+ module_put(THIS_MODULE);
+ return -EUNATCH;
+ }
+
+ error = 0;
+
+ spin_lock(&lslist_lock);
+ list_for_each_entry(ls, &lslist, ls_list) {
+ WARN_ON(ls->ls_create_count <= 0);
+ if (ls->ls_namelen != namelen)
+ continue;
+ if (memcmp(ls->ls_name, name, namelen))
+ continue;
+ if (flags & DLM_LSFL_NEWEXCL) {
+ error = -EEXIST;
+ break;
+ }
+ ls->ls_create_count++;
module_put(THIS_MODULE);
- return -EEXIST;
+ error = 1; /* not an error, return 0 */
+ break;
}
+ spin_unlock(&lslist_lock);
+
+ if (error < 0)
+ goto out;
+ if (error)
+ goto ret_zero;
+
+ error = -ENOMEM;
ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL);
if (!ls)
@@ -408,6 +440,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
ls->ls_lvblen = lvblen;
ls->ls_count = 0;
ls->ls_flags = 0;
+ ls->ls_scan_time = jiffies;
if (flags & DLM_LSFL_TIMEWARN)
set_bit(LSFL_TIMEWARN, &ls->ls_flags);
@@ -418,8 +451,9 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
ls->ls_allocation = GFP_KERNEL;
/* ls_exflags are forced to match among nodes, and we don't
- need to require all nodes to have TIMEWARN or FS set */
- ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS));
+ need to require all nodes to have some flags set */
+ ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
+ DLM_LSFL_NEWEXCL));
size = dlm_config.ci_rsbtbl_size;
ls->ls_rsbtbl_size = size;
@@ -510,6 +544,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
down_write(&ls->ls_in_recovery);
spin_lock(&lslist_lock);
+ ls->ls_create_count = 1;
list_add(&ls->ls_list, &lslist);
spin_unlock(&lslist_lock);
@@ -548,7 +583,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
dlm_create_debug_file(ls);
log_debug(ls, "join complete");
-
+ ret_zero:
*lockspace = ls;
return 0;
@@ -635,13 +670,34 @@ static int release_lockspace(struct dlm_ls *ls, int force)
struct dlm_lkb *lkb;
struct dlm_rsb *rsb;
struct list_head *head;
- int i;
- int busy = lockspace_busy(ls);
+ int i, busy, rv;
+
+ busy = lockspace_busy(ls);
+
+ spin_lock(&lslist_lock);
+ if (ls->ls_create_count == 1) {
+ if (busy > force)
+ rv = -EBUSY;
+ else {
+ /* remove_lockspace takes ls off lslist */
+ ls->ls_create_count = 0;
+ rv = 0;
+ }
+ } else if (ls->ls_create_count > 1) {
+ rv = --ls->ls_create_count;
+ } else {
+ rv = -EINVAL;
+ }
+ spin_unlock(&lslist_lock);
- if (busy > force)
- return -EBUSY;
+ if (rv) {
+ log_debug(ls, "release_lockspace no remove %d", rv);
+ return rv;
+ }
+
+ dlm_device_deregister(ls);
- if (force < 3)
+ if (force < 3 && dlm_user_daemon_available())
do_uevent(ls, 0);
dlm_recoverd_stop(ls);
@@ -720,15 +776,10 @@ static int release_lockspace(struct dlm_ls *ls, int force)
dlm_clear_members(ls);
dlm_clear_members_gone(ls);
kfree(ls->ls_node_array);
+ log_debug(ls, "release_lockspace final free");
kobject_put(&ls->ls_kobj);
/* The ls structure will be freed when the kobject is done with */
- mutex_lock(&ls_lock);
- ls_count--;
- if (!ls_count)
- threads_stop();
- mutex_unlock(&ls_lock);
-
module_put(THIS_MODULE);
return 0;
}
@@ -750,11 +801,38 @@ static int release_lockspace(struct dlm_ls *ls, int force)
int dlm_release_lockspace(void *lockspace, int force)
{
struct dlm_ls *ls;
+ int error;
ls = dlm_find_lockspace_local(lockspace);
if (!ls)
return -EINVAL;
dlm_put_lockspace(ls);
- return release_lockspace(ls, force);
+
+ mutex_lock(&ls_lock);
+ error = release_lockspace(ls, force);
+ if (!error)
+ ls_count--;
+ else if (!ls_count)
+ threads_stop();
+ mutex_unlock(&ls_lock);
+
+ return error;
+}
+
+void dlm_stop_lockspaces(void)
+{
+ struct dlm_ls *ls;
+
+ restart:
+ spin_lock(&lslist_lock);
+ list_for_each_entry(ls, &lslist, ls_list) {
+ if (!test_bit(LSFL_RUNNING, &ls->ls_flags))
+ continue;
+ spin_unlock(&lslist_lock);
+ log_error(ls, "no userland control daemon, stopping lockspace");
+ dlm_ls_stop(ls);
+ goto restart;
+ }
+ spin_unlock(&lslist_lock);
}
diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h
index 891eabbdd02..f879f87901f 100644
--- a/fs/dlm/lockspace.h
+++ b/fs/dlm/lockspace.h
@@ -20,6 +20,7 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id);
struct dlm_ls *dlm_find_lockspace_local(void *id);
struct dlm_ls *dlm_find_lockspace_device(int minor);
void dlm_put_lockspace(struct dlm_ls *ls);
+void dlm_stop_lockspaces(void);
#endif /* __LOCKSPACE_DOT_H__ */
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 637018c891e..3962262f991 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -891,8 +891,10 @@ static void tcp_connect_to_sock(struct connection *con)
goto out_err;
memset(&saddr, 0, sizeof(saddr));
- if (dlm_nodeid_to_addr(con->nodeid, &saddr))
+ if (dlm_nodeid_to_addr(con->nodeid, &saddr)) {
+ sock_release(sock);
goto out_err;
+ }
sock->sk->sk_user_data = con;
con->rx_action = receive_from_sock;
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 78878c5781c..eba87ff3177 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -116,7 +116,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
if (xop->callback == NULL)
wait_event(recv_wq, (op->done != 0));
else {
- rv = -EINPROGRESS;
+ rv = FILE_LOCK_DEFERRED;
goto out;
}
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebbcf38fd33..b3832c67194 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -26,6 +26,8 @@
static const char name_prefix[] = "dlm";
static const struct file_operations device_fops;
+static atomic_t dlm_monitor_opened;
+static int dlm_monitor_unused = 1;
#ifdef CONFIG_COMPAT
@@ -339,10 +341,15 @@ static int device_user_deadlock(struct dlm_user_proc *proc,
return error;
}
-static int create_misc_device(struct dlm_ls *ls, char *name)
+static int dlm_device_register(struct dlm_ls *ls, char *name)
{
int error, len;
+ /* The device is already registered. This happens when the
+ lockspace is created multiple times from userspace. */
+ if (ls->ls_device.name)
+ return 0;
+
error = -ENOMEM;
len = strlen(name) + strlen(name_prefix) + 2;
ls->ls_device.name = kzalloc(len, GFP_KERNEL);
@@ -362,6 +369,22 @@ fail:
return error;
}
+int dlm_device_deregister(struct dlm_ls *ls)
+{
+ int error;
+
+ /* The device is not registered. This happens when the lockspace
+ was never used from userspace, or when device_create_lockspace()
+ calls dlm_release_lockspace() after the register fails. */
+ if (!ls->ls_device.name)
+ return 0;
+
+ error = misc_deregister(&ls->ls_device);
+ if (!error)
+ kfree(ls->ls_device.name);
+ return error;
+}
+
static int device_user_purge(struct dlm_user_proc *proc,
struct dlm_purge_params *params)
{
@@ -396,7 +419,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
if (!ls)
return -ENOENT;
- error = create_misc_device(ls, params->name);
+ error = dlm_device_register(ls, params->name);
dlm_put_lockspace(ls);
if (error)
@@ -420,31 +443,22 @@ static int device_remove_lockspace(struct dlm_lspace_params *params)
if (!ls)
return -ENOENT;
- /* Deregister the misc device first, so we don't have
- * a device that's not attached to a lockspace. If
- * dlm_release_lockspace fails then we can recreate it
- */
- error = misc_deregister(&ls->ls_device);
- if (error) {
- dlm_put_lockspace(ls);
- goto out;
- }
- kfree(ls->ls_device.name);
-
if (params->flags & DLM_USER_LSFLG_FORCEFREE)
force = 2;
lockspace = ls->ls_local_handle;
+ dlm_put_lockspace(ls);
- /* dlm_release_lockspace waits for references to go to zero,
- so all processes will need to close their device for the ls
- before the release will procede */
+ /* The final dlm_release_lockspace waits for references to go to
+ zero, so all processes will need to close their device for the
+ ls before the release will proceed. release also calls the
+ device_deregister above. Converting a positive return value
+ from release to zero means that userspace won't know when its
+ release was the final one, but it shouldn't need to know. */
- dlm_put_lockspace(ls);
error = dlm_release_lockspace(lockspace, force);
- if (error)
- create_misc_device(ls, ls->ls_name);
- out:
+ if (error > 0)
+ error = 0;
return error;
}
@@ -526,8 +540,10 @@ static ssize_t device_write(struct file *file, const char __user *buf,
k32buf = (struct dlm_write_request32 *)kbuf;
kbuf = kmalloc(count + 1 + (sizeof(struct dlm_write_request) -
sizeof(struct dlm_write_request32)), GFP_KERNEL);
- if (!kbuf)
+ if (!kbuf) {
+ kfree(k32buf);
return -ENOMEM;
+ }
if (proc)
set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
@@ -538,8 +554,10 @@ static ssize_t device_write(struct file *file, const char __user *buf,
/* do we really need this? can a write happen after a close? */
if ((kbuf->cmd == DLM_USER_LOCK || kbuf->cmd == DLM_USER_UNLOCK) &&
- test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))
- return -EINVAL;
+ (proc && test_bit(DLM_PROC_FLAGS_CLOSING, &proc->flags))) {
+ error = -EINVAL;
+ goto out_free;
+ }
sigfillset(&allsigs);
sigprocmask(SIG_BLOCK, &allsigs, &tmpsig);
@@ -868,6 +886,26 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
return 0;
}
+int dlm_user_daemon_available(void)
+{
+ /* dlm_controld hasn't started (or, has started, but not
+ properly populated configfs) */
+
+ if (!dlm_our_nodeid())
+ return 0;
+
+ /* This is to deal with versions of dlm_controld that don't
+ know about the monitor device. We assume that if the
+ dlm_controld was started (above), but the monitor device
+ was never opened, that it's an old version. dlm_controld
+ should open the monitor device before populating configfs. */
+
+ if (dlm_monitor_unused)
+ return 1;
+
+ return atomic_read(&dlm_monitor_opened) ? 1 : 0;
+}
+
static int ctl_device_open(struct inode *inode, struct file *file)
{
file->private_data = NULL;
@@ -879,6 +917,20 @@ static int ctl_device_close(struct inode *inode, struct file *file)
return 0;
}
+static int monitor_device_open(struct inode *inode, struct file *file)
+{
+ atomic_inc(&dlm_monitor_opened);
+ dlm_monitor_unused = 0;
+ return 0;
+}
+
+static int monitor_device_close(struct inode *inode, struct file *file)
+{
+ if (atomic_dec_and_test(&dlm_monitor_opened))
+ dlm_stop_lockspaces();
+ return 0;
+}
+
static const struct file_operations device_fops = {
.open = device_open,
.release = device_close,
@@ -902,19 +954,42 @@ static struct miscdevice ctl_device = {
.minor = MISC_DYNAMIC_MINOR,
};
+static const struct file_operations monitor_device_fops = {
+ .open = monitor_device_open,
+ .release = monitor_device_close,
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice monitor_device = {
+ .name = "dlm-monitor",
+ .fops = &monitor_device_fops,
+ .minor = MISC_DYNAMIC_MINOR,
+};
+
int __init dlm_user_init(void)
{
int error;
+ atomic_set(&dlm_monitor_opened, 0);
+
error = misc_register(&ctl_device);
- if (error)
+ if (error) {
log_print("misc_register failed for control device");
+ goto out;
+ }
+ error = misc_register(&monitor_device);
+ if (error) {
+ log_print("misc_register failed for monitor device");
+ misc_deregister(&ctl_device);
+ }
+ out:
return error;
}
void dlm_user_exit(void)
{
misc_deregister(&ctl_device);
+ misc_deregister(&monitor_device);
}
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index d38e9f3e415..35eb6a13d61 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -12,5 +12,7 @@
void dlm_user_add_ast(struct dlm_lkb *lkb, int type);
int dlm_user_init(void);
void dlm_user_exit(void);
+int dlm_device_deregister(struct dlm_ls *ls);
+int dlm_user_daemon_available(void);
#endif
diff --git a/fs/dquot.c b/fs/dquot.c
index 5ac77da1995..ad7e59003e0 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -562,6 +562,8 @@ static struct shrinker dqcache_shrinker = {
*/
static void dqput(struct dquot *dquot)
{
+ int ret;
+
if (!dquot)
return;
#ifdef __DQUOT_PARANOIA
@@ -594,7 +596,19 @@ we_slept:
if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && dquot_dirty(dquot)) {
spin_unlock(&dq_list_lock);
/* Commit dquot before releasing */
- dquot->dq_sb->dq_op->write_dquot(dquot);
+ ret = dquot->dq_sb->dq_op->write_dquot(dquot);
+ if (ret < 0) {
+ printk(KERN_ERR "VFS: cannot write quota structure on "
+ "device %s (error %d). Quota may get out of "
+ "sync!\n", dquot->dq_sb->s_id, ret);
+ /*
+ * We clear dirty bit anyway, so that we avoid
+ * infinite loop here
+ */
+ spin_lock(&dq_list_lock);
+ clear_dquot_dirty(dquot);
+ spin_unlock(&dq_list_lock);
+ }
goto we_slept;
}
/* Clear flag in case dquot was inactive (something bad happened) */
@@ -875,13 +889,15 @@ static void print_warning(struct dquot *dquot, const int warntype)
char *msg = NULL;
struct tty_struct *tty;
- if (!need_print_warning(dquot))
+ if (warntype == QUOTA_NL_IHARDBELOW ||
+ warntype == QUOTA_NL_ISOFTBELOW ||
+ warntype == QUOTA_NL_BHARDBELOW ||
+ warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot))
return;
- mutex_lock(&tty_mutex);
tty = get_current_tty();
if (!tty)
- goto out_lock;
+ return;
tty_write_message(tty, dquot->dq_sb->s_id);
if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN)
tty_write_message(tty, ": warning, ");
@@ -909,8 +925,7 @@ static void print_warning(struct dquot *dquot, const int warntype)
break;
}
tty_write_message(tty, msg);
-out_lock:
- mutex_unlock(&tty_mutex);
+ tty_kref_put(tty);
}
#endif
@@ -1083,6 +1098,35 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
return QUOTA_OK;
}
+static int info_idq_free(struct dquot *dquot, ulong inodes)
+{
+ if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
+ dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
+ return QUOTA_NL_NOWARN;
+
+ if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
+ return QUOTA_NL_ISOFTBELOW;
+ if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit &&
+ dquot->dq_dqb.dqb_curinodes - inodes < dquot->dq_dqb.dqb_ihardlimit)
+ return QUOTA_NL_IHARDBELOW;
+ return QUOTA_NL_NOWARN;
+}
+
+static int info_bdq_free(struct dquot *dquot, qsize_t space)
+{
+ if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
+ toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+ return QUOTA_NL_NOWARN;
+
+ if (toqb(dquot->dq_dqb.dqb_curspace - space) <=
+ dquot->dq_dqb.dqb_bsoftlimit)
+ return QUOTA_NL_BSOFTBELOW;
+ if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit &&
+ toqb(dquot->dq_dqb.dqb_curspace - space) <
+ dquot->dq_dqb.dqb_bhardlimit)
+ return QUOTA_NL_BHARDBELOW;
+ return QUOTA_NL_NOWARN;
+}
/*
* Initialize quota pointers in inode
* Transaction must be started at entry
@@ -1139,6 +1183,28 @@ int dquot_drop(struct inode *inode)
return 0;
}
+/* Wrapper to remove references to quota structures from inode */
+void vfs_dq_drop(struct inode *inode)
+{
+ /* Here we can get arbitrary inode from clear_inode() so we have
+ * to be careful. OTOH we don't need locking as quota operations
+ * are allowed to change only at mount time */
+ if (!IS_NOQUOTA(inode) && inode->i_sb && inode->i_sb->dq_op
+ && inode->i_sb->dq_op->drop) {
+ int cnt;
+ /* Test before calling to rule out calls from proc and such
+ * where we are not allowed to block. Note that this is
+ * actually reliable test even without the lock - the caller
+ * must assure that nobody can come after the DQUOT_DROP and
+ * add quota pointers back anyway */
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+ if (inode->i_dquot[cnt] != NODQUOT)
+ break;
+ if (cnt < MAXQUOTAS)
+ inode->i_sb->dq_op->drop(inode);
+ }
+}
+
/*
* Following four functions update i_blocks+i_bytes fields and
* quota information (together with appropriate checks)
@@ -1248,6 +1314,7 @@ warn_put_all:
int dquot_free_space(struct inode *inode, qsize_t number)
{
unsigned int cnt;
+ char warntype[MAXQUOTAS];
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
@@ -1256,6 +1323,7 @@ out_sub:
inode_sub_bytes(inode, number);
return QUOTA_OK;
}
+
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
/* Now recheck reliably when holding dqptr_sem */
if (IS_NOQUOTA(inode)) {
@@ -1266,6 +1334,7 @@ out_sub:
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (inode->i_dquot[cnt] == NODQUOT)
continue;
+ warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number);
dquot_decr_space(inode->i_dquot[cnt], number);
}
inode_sub_bytes(inode, number);
@@ -1274,6 +1343,7 @@ out_sub:
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
if (inode->i_dquot[cnt])
mark_dquot_dirty(inode->i_dquot[cnt]);
+ flush_warnings(inode->i_dquot, warntype);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
return QUOTA_OK;
}
@@ -1284,11 +1354,13 @@ out_sub:
int dquot_free_inode(const struct inode *inode, unsigned long number)
{
unsigned int cnt;
+ char warntype[MAXQUOTAS];
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
if (IS_NOQUOTA(inode))
return QUOTA_OK;
+
down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
/* Now recheck reliably when holding dqptr_sem */
if (IS_NOQUOTA(inode)) {
@@ -1299,6 +1371,7 @@ int dquot_free_inode(const struct inode *inode, unsigned long number)
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (inode->i_dquot[cnt] == NODQUOT)
continue;
+ warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number);
dquot_decr_inodes(inode->i_dquot[cnt], number);
}
spin_unlock(&dq_data_lock);
@@ -1306,6 +1379,7 @@ int dquot_free_inode(const struct inode *inode, unsigned long number)
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
if (inode->i_dquot[cnt])
mark_dquot_dirty(inode->i_dquot[cnt]);
+ flush_warnings(inode->i_dquot, warntype);
up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
return QUOTA_OK;
}
@@ -1323,7 +1397,8 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
struct dquot *transfer_to[MAXQUOTAS];
int cnt, ret = NO_QUOTA, chuid = (iattr->ia_valid & ATTR_UID) && inode->i_uid != iattr->ia_uid,
chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid;
- char warntype[MAXQUOTAS];
+ char warntype_to[MAXQUOTAS];
+ char warntype_from_inodes[MAXQUOTAS], warntype_from_space[MAXQUOTAS];
/* First test before acquiring mutex - solves deadlocks when we
* re-enter the quota code and are already holding the mutex */
@@ -1332,7 +1407,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
/* Clear the arrays */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
transfer_to[cnt] = transfer_from[cnt] = NODQUOT;
- warntype[cnt] = QUOTA_NL_NOWARN;
+ warntype_to[cnt] = QUOTA_NL_NOWARN;
}
down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
/* Now recheck reliably when holding dqptr_sem */
@@ -1364,8 +1439,9 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
if (transfer_to[cnt] == NODQUOT)
continue;
transfer_from[cnt] = inode->i_dquot[cnt];
- if (check_idq(transfer_to[cnt], 1, warntype+cnt) == NO_QUOTA ||
- check_bdq(transfer_to[cnt], space, 0, warntype+cnt) == NO_QUOTA)
+ if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) ==
+ NO_QUOTA || check_bdq(transfer_to[cnt], space, 0,
+ warntype_to + cnt) == NO_QUOTA)
goto warn_put_all;
}
@@ -1381,6 +1457,10 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
/* Due to IO error we might not have transfer_from[] structure */
if (transfer_from[cnt]) {
+ warntype_from_inodes[cnt] =
+ info_idq_free(transfer_from[cnt], 1);
+ warntype_from_space[cnt] =
+ info_bdq_free(transfer_from[cnt], space);
dquot_decr_inodes(transfer_from[cnt], 1);
dquot_decr_space(transfer_from[cnt], space);
}
@@ -1400,7 +1480,9 @@ warn_put_all:
if (transfer_to[cnt])
mark_dquot_dirty(transfer_to[cnt]);
}
- flush_warnings(transfer_to, warntype);
+ flush_warnings(transfer_to, warntype_to);
+ flush_warnings(transfer_from, warntype_from_inodes);
+ flush_warnings(transfer_from, warntype_from_space);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (ret == QUOTA_OK && transfer_from[cnt] != NODQUOT)
@@ -1412,6 +1494,18 @@ warn_put_all:
return ret;
}
+/* Wrapper for transferring ownership of an inode */
+int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
+{
+ if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) {
+ vfs_dq_init(inode);
+ if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
+ return 1;
+ }
+ return 0;
+}
+
+
/*
* Write info of quota file to disk
*/
@@ -1697,6 +1791,21 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
return ret;
}
+int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
+ struct path *path)
+{
+ int error = security_quota_on(path->dentry);
+ if (error)
+ return error;
+ /* Quota file not on the same filesystem? */
+ if (path->mnt->mnt_sb != sb)
+ error = -EXDEV;
+ else
+ error = vfs_quota_on_inode(path->dentry->d_inode, type,
+ format_id);
+ return error;
+}
+
/* Actual function called from quotactl() */
int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path,
int remount)
@@ -1708,19 +1817,10 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path,
return vfs_quota_on_remount(sb, type);
error = path_lookup(path, LOOKUP_FOLLOW, &nd);
- if (error < 0)
- return error;
- error = security_quota_on(nd.path.dentry);
- if (error)
- goto out_path;
- /* Quota file not on the same filesystem? */
- if (nd.path.mnt->mnt_sb != sb)
- error = -EXDEV;
- else
- error = vfs_quota_on_inode(nd.path.dentry->d_inode, type,
- format_id);
-out_path:
- path_put(&nd.path);
+ if (!error) {
+ error = vfs_quota_on_path(sb, type, format_id, &nd.path);
+ path_put(&nd.path);
+ }
return error;
}
@@ -1752,6 +1852,22 @@ out:
return error;
}
+/* Wrapper to turn on quotas when remounting rw */
+int vfs_dq_quota_on_remount(struct super_block *sb)
+{
+ int cnt;
+ int ret = 0, err;
+
+ if (!sb->s_qcop || !sb->s_qcop->quota_on)
+ return -ENOSYS;
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1);
+ if (err < 0 && !ret)
+ ret = err;
+ }
+ return ret;
+}
+
/* Generic routine for getting common part of quota structure */
static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
{
@@ -2073,6 +2189,7 @@ EXPORT_SYMBOL(unregister_quota_format);
EXPORT_SYMBOL(dqstats);
EXPORT_SYMBOL(dq_data_lock);
EXPORT_SYMBOL(vfs_quota_on);
+EXPORT_SYMBOL(vfs_quota_on_path);
EXPORT_SYMBOL(vfs_quota_on_mount);
EXPORT_SYMBOL(vfs_quota_off);
EXPORT_SYMBOL(vfs_quota_sync);
@@ -2087,8 +2204,11 @@ EXPORT_SYMBOL(dquot_release);
EXPORT_SYMBOL(dquot_mark_dquot_dirty);
EXPORT_SYMBOL(dquot_initialize);
EXPORT_SYMBOL(dquot_drop);
+EXPORT_SYMBOL(vfs_dq_drop);
EXPORT_SYMBOL(dquot_alloc_space);
EXPORT_SYMBOL(dquot_alloc_inode);
EXPORT_SYMBOL(dquot_free_space);
EXPORT_SYMBOL(dquot_free_inode);
EXPORT_SYMBOL(dquot_transfer);
+EXPORT_SYMBOL(vfs_dq_transfer);
+EXPORT_SYMBOL(vfs_dq_quota_on_remount);
diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile
index 1e34a7fd488..b4755a85996 100644
--- a/fs/ecryptfs/Makefile
+++ b/fs/ecryptfs/Makefile
@@ -4,4 +4,4 @@
obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o
-ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o debug.o
+ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o kthread.o debug.o
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index e2832bc7869..06db79d05c1 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -33,6 +33,7 @@
#include <linux/crypto.h>
#include <linux/file.h>
#include <linux/scatterlist.h>
+#include <asm/unaligned.h>
#include "ecryptfs_kernel.h"
static int
@@ -474,8 +475,8 @@ int ecryptfs_encrypt_page(struct page *page)
{
struct inode *ecryptfs_inode;
struct ecryptfs_crypt_stat *crypt_stat;
- char *enc_extent_virt = NULL;
- struct page *enc_extent_page;
+ char *enc_extent_virt;
+ struct page *enc_extent_page = NULL;
loff_t extent_offset;
int rc = 0;
@@ -491,14 +492,14 @@ int ecryptfs_encrypt_page(struct page *page)
page->index);
goto out;
}
- enc_extent_virt = kmalloc(PAGE_CACHE_SIZE, GFP_USER);
- if (!enc_extent_virt) {
+ enc_extent_page = alloc_page(GFP_USER);
+ if (!enc_extent_page) {
rc = -ENOMEM;
ecryptfs_printk(KERN_ERR, "Error allocating memory for "
"encrypted extent\n");
goto out;
}
- enc_extent_page = virt_to_page(enc_extent_virt);
+ enc_extent_virt = kmap(enc_extent_page);
for (extent_offset = 0;
extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
extent_offset++) {
@@ -526,7 +527,10 @@ int ecryptfs_encrypt_page(struct page *page)
}
}
out:
- kfree(enc_extent_virt);
+ if (enc_extent_page) {
+ kunmap(enc_extent_page);
+ __free_page(enc_extent_page);
+ }
return rc;
}
@@ -608,8 +612,8 @@ int ecryptfs_decrypt_page(struct page *page)
{
struct inode *ecryptfs_inode;
struct ecryptfs_crypt_stat *crypt_stat;
- char *enc_extent_virt = NULL;
- struct page *enc_extent_page;
+ char *enc_extent_virt;
+ struct page *enc_extent_page = NULL;
unsigned long extent_offset;
int rc = 0;
@@ -626,14 +630,14 @@ int ecryptfs_decrypt_page(struct page *page)
page->index);
goto out;
}
- enc_extent_virt = kmalloc(PAGE_CACHE_SIZE, GFP_USER);
- if (!enc_extent_virt) {
+ enc_extent_page = alloc_page(GFP_USER);
+ if (!enc_extent_page) {
rc = -ENOMEM;
ecryptfs_printk(KERN_ERR, "Error allocating memory for "
"encrypted extent\n");
goto out;
}
- enc_extent_page = virt_to_page(enc_extent_virt);
+ enc_extent_virt = kmap(enc_extent_page);
for (extent_offset = 0;
extent_offset < (PAGE_CACHE_SIZE / crypt_stat->extent_size);
extent_offset++) {
@@ -661,7 +665,10 @@ int ecryptfs_decrypt_page(struct page *page)
}
}
out:
- kfree(enc_extent_virt);
+ if (enc_extent_page) {
+ kunmap(enc_extent_page);
+ __free_page(enc_extent_page);
+ }
return rc;
}
@@ -1032,10 +1039,8 @@ static int contains_ecryptfs_marker(char *data)
{
u32 m_1, m_2;
- memcpy(&m_1, data, 4);
- m_1 = be32_to_cpu(m_1);
- memcpy(&m_2, (data + 4), 4);
- m_2 = be32_to_cpu(m_2);
+ m_1 = get_unaligned_be32(data);
+ m_2 = get_unaligned_be32(data + 4);
if ((m_1 ^ MAGIC_ECRYPTFS_MARKER) == m_2)
return 1;
ecryptfs_printk(KERN_DEBUG, "m_1 = [0x%.8x]; m_2 = [0x%.8x]; "
@@ -1073,8 +1078,7 @@ static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat,
int i;
u32 flags;
- memcpy(&flags, page_virt, 4);
- flags = be32_to_cpu(flags);
+ flags = get_unaligned_be32(page_virt);
for (i = 0; i < ((sizeof(ecryptfs_flag_map)
/ sizeof(struct ecryptfs_flag_map_elem))); i++)
if (flags & ecryptfs_flag_map[i].file_flag) {
@@ -1100,11 +1104,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written)
get_random_bytes(&m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
m_2 = (m_1 ^ MAGIC_ECRYPTFS_MARKER);
- m_1 = cpu_to_be32(m_1);
- memcpy(page_virt, &m_1, (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
- m_2 = cpu_to_be32(m_2);
- memcpy(page_virt + (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2), &m_2,
- (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2));
+ put_unaligned_be32(m_1, page_virt);
+ page_virt += (MAGIC_ECRYPTFS_MARKER_SIZE_BYTES / 2);
+ put_unaligned_be32(m_2, page_virt);
(*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
}
@@ -1121,8 +1123,7 @@ write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat,
flags |= ecryptfs_flag_map[i].file_flag;
/* Version is in top 8 bits of the 32-bit flag vector */
flags |= ((((u8)crypt_stat->file_version) << 24) & 0xFF000000);
- flags = cpu_to_be32(flags);
- memcpy(page_virt, &flags, 4);
+ put_unaligned_be32(flags, page_virt);
(*written) = 4;
}
@@ -1238,11 +1239,9 @@ ecryptfs_write_header_metadata(char *virt,
num_header_extents_at_front =
(u16)(crypt_stat->num_header_bytes_at_front
/ crypt_stat->extent_size);
- header_extent_size = cpu_to_be32(header_extent_size);
- memcpy(virt, &header_extent_size, 4);
+ put_unaligned_be32(header_extent_size, virt);
virt += 4;
- num_header_extents_at_front = cpu_to_be16(num_header_extents_at_front);
- memcpy(virt, &num_header_extents_at_front, 2);
+ put_unaligned_be16(num_header_extents_at_front, virt);
(*written) = 6;
}
@@ -1410,15 +1409,13 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
u32 header_extent_size;
u16 num_header_extents_at_front;
- memcpy(&header_extent_size, virt, sizeof(u32));
- header_extent_size = be32_to_cpu(header_extent_size);
- virt += sizeof(u32);
- memcpy(&num_header_extents_at_front, virt, sizeof(u16));
- num_header_extents_at_front = be16_to_cpu(num_header_extents_at_front);
+ header_extent_size = get_unaligned_be32(virt);
+ virt += sizeof(__be32);
+ num_header_extents_at_front = get_unaligned_be16(virt);
crypt_stat->num_header_bytes_at_front =
(((size_t)num_header_extents_at_front
* (size_t)header_extent_size));
- (*bytes_read) = (sizeof(u32) + sizeof(u16));
+ (*bytes_read) = (sizeof(__be32) + sizeof(__be16));
if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE)
&& (crypt_stat->num_header_bytes_at_front
< ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) {
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index c15c25745e0..b73fb752c5f 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -559,10 +559,25 @@ extern struct kmem_cache *ecryptfs_key_record_cache;
extern struct kmem_cache *ecryptfs_key_sig_cache;
extern struct kmem_cache *ecryptfs_global_auth_tok_cache;
extern struct kmem_cache *ecryptfs_key_tfm_cache;
+extern struct kmem_cache *ecryptfs_open_req_cache;
+struct ecryptfs_open_req {
+#define ECRYPTFS_REQ_PROCESSED 0x00000001
+#define ECRYPTFS_REQ_DROPPED 0x00000002
+#define ECRYPTFS_REQ_ZOMBIE 0x00000004
+ u32 flags;
+ struct file **lower_file;
+ struct dentry *lower_dentry;
+ struct vfsmount *lower_mnt;
+ wait_queue_head_t wait;
+ struct mutex mux;
+ struct list_head kthread_ctl_list;
+};
+
+#define ECRYPTFS_INTERPOSE_FLAG_D_ADD 0x00000001
int ecryptfs_interpose(struct dentry *hidden_dentry,
struct dentry *this_dentry, struct super_block *sb,
- int flag);
+ u32 flags);
int ecryptfs_fill_zeros(struct file *file, loff_t new_length);
int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat,
const char *name, int length,
@@ -690,5 +705,11 @@ void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx);
int
ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid,
struct user_namespace *user_ns, struct pid *pid);
+int ecryptfs_init_kthread(void);
+void ecryptfs_destroy_kthread(void);
+int ecryptfs_privileged_open(struct file **lower_file,
+ struct dentry *lower_dentry,
+ struct vfsmount *lower_mnt);
+int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry);
#endif /* #ifndef ECRYPTFS_KERNEL_H */
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 2258b8f654a..9244d653743 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/compat.h>
#include <linux/fs_stack.h>
+#include <linux/smp_lock.h>
#include "ecryptfs_kernel.h"
/**
@@ -191,6 +192,23 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
| ECRYPTFS_ENCRYPTED);
}
mutex_unlock(&crypt_stat->cs_mutex);
+ if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY)
+ && !(file->f_flags & O_RDONLY)) {
+ rc = -EPERM;
+ printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs "
+ "file must hence be opened RO\n", __func__);
+ goto out;
+ }
+ if (!ecryptfs_inode_to_private(inode)->lower_file) {
+ rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to initialize "
+ "the persistent file for the dentry with name "
+ "[%s]; rc = [%d]\n", __func__,
+ ecryptfs_dentry->d_name.name, rc);
+ goto out;
+ }
+ }
ecryptfs_set_file_lower(
file, ecryptfs_inode_to_private(inode)->lower_file);
if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) {
@@ -277,9 +295,11 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
int rc = 0;
struct file *lower_file = NULL;
+ lock_kernel();
lower_file = ecryptfs_file_to_lower(file);
if (lower_file->f_op && lower_file->f_op->fasync)
rc = lower_file->f_op->fasync(fd, lower_file, flag);
+ unlock_kernel();
return rc;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index c92cc1c00aa..89209f00f9c 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -31,6 +31,7 @@
#include <linux/mount.h>
#include <linux/crypto.h>
#include <linux/fs_stack.h>
+#include <asm/unaligned.h>
#include "ecryptfs_kernel.h"
static struct dentry *lock_parent(struct dentry *dentry)
@@ -188,6 +189,16 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
"context; rc = [%d]\n", rc);
goto out;
}
+ if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) {
+ rc = ecryptfs_init_persistent_file(ecryptfs_dentry);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to initialize "
+ "the persistent file for the dentry with name "
+ "[%s]; rc = [%d]\n", __func__,
+ ecryptfs_dentry->d_name.name, rc);
+ goto out;
+ }
+ }
rc = ecryptfs_write_metadata(ecryptfs_dentry);
if (rc) {
printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc);
@@ -307,10 +318,11 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
d_add(dentry, NULL);
goto out;
}
- rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, 1);
+ rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb,
+ ECRYPTFS_INTERPOSE_FLAG_D_ADD);
if (rc) {
ecryptfs_printk(KERN_ERR, "Error interposing\n");
- goto out_dput;
+ goto out;
}
if (S_ISDIR(lower_inode->i_mode)) {
ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n");
@@ -336,11 +348,21 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
rc = -ENOMEM;
ecryptfs_printk(KERN_ERR,
"Cannot ecryptfs_kmalloc a page\n");
- goto out_dput;
+ goto out;
}
crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
ecryptfs_set_default_sizes(crypt_stat);
+ if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) {
+ rc = ecryptfs_init_persistent_file(dentry);
+ if (rc) {
+ printk(KERN_ERR "%s: Error attempting to initialize "
+ "the persistent file for the dentry with name "
+ "[%s]; rc = [%d]\n", __func__,
+ dentry->d_name.name, rc);
+ goto out;
+ }
+ }
rc = ecryptfs_read_and_validate_header_region(page_virt,
dentry->d_inode);
if (rc) {
@@ -364,8 +386,7 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry,
else
file_size = i_size_read(lower_dentry->d_inode);
} else {
- memcpy(&file_size, page_virt, sizeof(file_size));
- file_size = be64_to_cpu(file_size);
+ file_size = get_unaligned_be64(page_virt);
}
i_size_write(dentry->d_inode, (loff_t)file_size);
kmem_cache_free(ecryptfs_header_cache_2, page_virt);
@@ -444,7 +465,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
int rc;
struct dentry *lower_dentry;
struct dentry *lower_dir_dentry;
- umode_t mode;
char *encoded_symname;
int encoded_symlen;
struct ecryptfs_crypt_stat *crypt_stat = NULL;
@@ -452,7 +472,6 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
lower_dentry = ecryptfs_dentry_to_lower(dentry);
dget(lower_dentry);
lower_dir_dentry = lock_parent(lower_dentry);
- mode = S_IALLUGO;
encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname,
strlen(symname),
&encoded_symname);
@@ -461,7 +480,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
goto out_lock;
}
rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry,
- encoded_symname, mode);
+ encoded_symname);
kfree(encoded_symname);
if (rc || !lower_dentry->d_inode)
goto out_lock;
@@ -809,22 +828,9 @@ out:
}
static int
-ecryptfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+ecryptfs_permission(struct inode *inode, int mask)
{
- int rc;
-
- if (nd) {
- struct vfsmount *vfsmnt_save = nd->path.mnt;
- struct dentry *dentry_save = nd->path.dentry;
-
- nd->path.mnt = ecryptfs_dentry_to_lower_mnt(nd->path.dentry);
- nd->path.dentry = ecryptfs_dentry_to_lower(nd->path.dentry);
- rc = permission(ecryptfs_inode_to_lower(inode), mask, nd);
- nd->path.mnt = vfsmnt_save;
- nd->path.dentry = dentry_save;
- } else
- rc = permission(ecryptfs_inode_to_lower(inode), mask, NULL);
- return rc;
+ return inode_permission(ecryptfs_inode_to_lower(inode), mask);
}
/**
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e82b457180b..f5b76a331b9 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -44,15 +44,15 @@ static int process_request_key_err(long err_code)
int rc = 0;
switch (err_code) {
- case ENOKEY:
+ case -ENOKEY:
ecryptfs_printk(KERN_WARNING, "No key\n");
rc = -ENOENT;
break;
- case EKEYEXPIRED:
+ case -EKEYEXPIRED:
ecryptfs_printk(KERN_WARNING, "Key expired\n");
rc = -ETIME;
break;
- case EKEYREVOKED:
+ case -EKEYREVOKED:
ecryptfs_printk(KERN_WARNING, "Key revoked\n");
rc = -EINVAL;
break;
@@ -963,8 +963,7 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
if (!(*auth_tok_key) || IS_ERR(*auth_tok_key)) {
printk(KERN_ERR "Could not find key with description: [%s]\n",
sig);
- process_request_key_err(PTR_ERR(*auth_tok_key));
- rc = -EINVAL;
+ rc = process_request_key_err(PTR_ERR(*auth_tok_key));
goto out;
}
(*auth_tok) = ecryptfs_get_key_payload_data(*auth_tok_key);
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
new file mode 100644
index 00000000000..c440c6b58b2
--- /dev/null
+++ b/fs/ecryptfs/kthread.c
@@ -0,0 +1,203 @@
+/**
+ * eCryptfs: Linux filesystem encryption layer
+ *
+ * Copyright (C) 2008 International Business Machines Corp.
+ * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
+#include <linux/mount.h>
+#include "ecryptfs_kernel.h"
+
+struct kmem_cache *ecryptfs_open_req_cache;
+
+static struct ecryptfs_kthread_ctl {
+#define ECRYPTFS_KTHREAD_ZOMBIE 0x00000001
+ u32 flags;
+ struct mutex mux;
+ struct list_head req_list;
+ wait_queue_head_t wait;
+} ecryptfs_kthread_ctl;
+
+static struct task_struct *ecryptfs_kthread;
+
+/**
+ * ecryptfs_threadfn
+ * @ignored: ignored
+ *
+ * The eCryptfs kernel thread that has the responsibility of getting
+ * the lower persistent file with RW permissions.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+static int ecryptfs_threadfn(void *ignored)
+{
+ set_freezable();
+ while (1) {
+ struct ecryptfs_open_req *req;
+
+ wait_event_freezable(
+ ecryptfs_kthread_ctl.wait,
+ (!list_empty(&ecryptfs_kthread_ctl.req_list)
+ || kthread_should_stop()));
+ mutex_lock(&ecryptfs_kthread_ctl.mux);
+ if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) {
+ mutex_unlock(&ecryptfs_kthread_ctl.mux);
+ goto out;
+ }
+ while (!list_empty(&ecryptfs_kthread_ctl.req_list)) {
+ req = list_first_entry(&ecryptfs_kthread_ctl.req_list,
+ struct ecryptfs_open_req,
+ kthread_ctl_list);
+ mutex_lock(&req->mux);
+ list_del(&req->kthread_ctl_list);
+ if (!(req->flags & ECRYPTFS_REQ_ZOMBIE)) {
+ dget(req->lower_dentry);
+ mntget(req->lower_mnt);
+ (*req->lower_file) = dentry_open(
+ req->lower_dentry, req->lower_mnt,
+ (O_RDWR | O_LARGEFILE));
+ req->flags |= ECRYPTFS_REQ_PROCESSED;
+ }
+ wake_up(&req->wait);
+ mutex_unlock(&req->mux);
+ }
+ mutex_unlock(&ecryptfs_kthread_ctl.mux);
+ }
+out:
+ return 0;
+}
+
+int ecryptfs_init_kthread(void)
+{
+ int rc = 0;
+
+ mutex_init(&ecryptfs_kthread_ctl.mux);
+ init_waitqueue_head(&ecryptfs_kthread_ctl.wait);
+ INIT_LIST_HEAD(&ecryptfs_kthread_ctl.req_list);
+ ecryptfs_kthread = kthread_run(&ecryptfs_threadfn, NULL,
+ "ecryptfs-kthread");
+ if (IS_ERR(ecryptfs_kthread)) {
+ rc = PTR_ERR(ecryptfs_kthread);
+ printk(KERN_ERR "%s: Failed to create kernel thread; rc = [%d]"
+ "\n", __func__, rc);
+ }
+ return rc;
+}
+
+void ecryptfs_destroy_kthread(void)
+{
+ struct ecryptfs_open_req *req;
+
+ mutex_lock(&ecryptfs_kthread_ctl.mux);
+ ecryptfs_kthread_ctl.flags |= ECRYPTFS_KTHREAD_ZOMBIE;
+ list_for_each_entry(req, &ecryptfs_kthread_ctl.req_list,
+ kthread_ctl_list) {
+ mutex_lock(&req->mux);
+ req->flags |= ECRYPTFS_REQ_ZOMBIE;
+ wake_up(&req->wait);
+ mutex_unlock(&req->mux);
+ }
+ mutex_unlock(&ecryptfs_kthread_ctl.mux);
+ kthread_stop(ecryptfs_kthread);
+ wake_up(&ecryptfs_kthread_ctl.wait);
+}
+
+/**
+ * ecryptfs_privileged_open
+ * @lower_file: Result of dentry_open by root on lower dentry
+ * @lower_dentry: Lower dentry for file to open
+ * @lower_mnt: Lower vfsmount for file to open
+ *
+ * This function gets a r/w file opened againt the lower dentry.
+ *
+ * Returns zero on success; non-zero otherwise
+ */
+int ecryptfs_privileged_open(struct file **lower_file,
+ struct dentry *lower_dentry,
+ struct vfsmount *lower_mnt)
+{
+ struct ecryptfs_open_req *req;
+ int rc = 0;
+
+ /* Corresponding dput() and mntput() are done when the
+ * persistent file is fput() when the eCryptfs inode is
+ * destroyed. */
+ dget(lower_dentry);
+ mntget(lower_mnt);
+ (*lower_file) = dentry_open(lower_dentry, lower_mnt,
+ (O_RDWR | O_LARGEFILE));
+ if (!IS_ERR(*lower_file))
+ goto out;
+ req = kmem_cache_alloc(ecryptfs_open_req_cache, GFP_KERNEL);
+ if (!req) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ mutex_init(&req->mux);
+ req->lower_file = lower_file;
+ req->lower_dentry = lower_dentry;
+ req->lower_mnt = lower_mnt;
+ init_waitqueue_head(&req->wait);
+ req->flags = 0;
+ mutex_lock(&ecryptfs_kthread_ctl.mux);
+ if (ecryptfs_kthread_ctl.flags & ECRYPTFS_KTHREAD_ZOMBIE) {
+ rc = -EIO;
+ mutex_unlock(&ecryptfs_kthread_ctl.mux);
+ printk(KERN_ERR "%s: We are in the middle of shutting down; "
+ "aborting privileged request to open lower file\n",
+ __func__);
+ goto out_free;
+ }
+ list_add_tail(&req->kthread_ctl_list, &ecryptfs_kthread_ctl.req_list);
+ mutex_unlock(&ecryptfs_kthread_ctl.mux);
+ wake_up(&ecryptfs_kthread_ctl.wait);
+ wait_event(req->wait, (req->flags != 0));
+ mutex_lock(&req->mux);
+ BUG_ON(req->flags == 0);
+ if (req->flags & ECRYPTFS_REQ_DROPPED
+ || req->flags & ECRYPTFS_REQ_ZOMBIE) {
+ rc = -EIO;
+ printk(KERN_WARNING "%s: Privileged open request dropped\n",
+ __func__);
+ goto out_unlock;
+ }
+ if (IS_ERR(*req->lower_file)) {
+ rc = PTR_ERR(*req->lower_file);
+ dget(lower_dentry);
+ mntget(lower_mnt);
+ (*lower_file) = dentry_open(lower_dentry, lower_mnt,
+ (O_RDONLY | O_LARGEFILE));
+ if (IS_ERR(*lower_file)) {
+ rc = PTR_ERR(*req->lower_file);
+ (*lower_file) = NULL;
+ printk(KERN_WARNING "%s: Error attempting privileged "
+ "open of lower file with either RW or RO "
+ "perms; rc = [%d]. Giving up.\n",
+ __func__, rc);
+ }
+ }
+out_unlock:
+ mutex_unlock(&req->mux);
+out_free:
+ kmem_cache_free(ecryptfs_open_req_cache, req);
+out:
+ return rc;
+}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d603631601e..8ebe9a5d1d9 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -117,7 +117,7 @@ void __ecryptfs_printk(const char *fmt, ...)
*
* Returns zero on success; non-zero otherwise
*/
-static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
+int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
{
struct ecryptfs_inode_info *inode_info =
ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
@@ -130,26 +130,12 @@ static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
- /* Corresponding dput() and mntput() are done when the
- * persistent file is fput() when the eCryptfs inode
- * is destroyed. */
- dget(lower_dentry);
- mntget(lower_mnt);
- inode_info->lower_file = dentry_open(lower_dentry,
- lower_mnt,
- (O_RDWR | O_LARGEFILE));
- if (IS_ERR(inode_info->lower_file)) {
- dget(lower_dentry);
- mntget(lower_mnt);
- inode_info->lower_file = dentry_open(lower_dentry,
- lower_mnt,
- (O_RDONLY
- | O_LARGEFILE));
- }
- if (IS_ERR(inode_info->lower_file)) {
+ rc = ecryptfs_privileged_open(&inode_info->lower_file,
+ lower_dentry, lower_mnt);
+ if (rc || IS_ERR(inode_info->lower_file)) {
printk(KERN_ERR "Error opening lower persistent file "
- "for lower_dentry [0x%p] and lower_mnt [0x%p]\n",
- lower_dentry, lower_mnt);
+ "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
+ "rc = [%d]\n", lower_dentry, lower_mnt, rc);
rc = PTR_ERR(inode_info->lower_file);
inode_info->lower_file = NULL;
}
@@ -163,14 +149,14 @@ static int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
* @lower_dentry: Existing dentry in the lower filesystem
* @dentry: ecryptfs' dentry
* @sb: ecryptfs's super_block
- * @flag: If set to true, then d_add is called, else d_instantiate is called
+ * @flags: flags to govern behavior of interpose procedure
*
* Interposes upper and lower dentries.
*
* Returns zero on success; non-zero otherwise
*/
int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
- struct super_block *sb, int flag)
+ struct super_block *sb, u32 flags)
{
struct inode *lower_inode;
struct inode *inode;
@@ -207,7 +193,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
init_special_inode(inode, lower_inode->i_mode,
lower_inode->i_rdev);
dentry->d_op = &ecryptfs_dops;
- if (flag)
+ if (flags & ECRYPTFS_INTERPOSE_FLAG_D_ADD)
d_add(dentry, inode);
else
d_instantiate(dentry, inode);
@@ -215,13 +201,6 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
/* This size will be overwritten for real files w/ headers and
* other metadata */
fsstack_copy_inode_size(inode, lower_inode);
- rc = ecryptfs_init_persistent_file(dentry);
- if (rc) {
- printk(KERN_ERR "%s: Error attempting to initialize the "
- "persistent file for the dentry with name [%s]; "
- "rc = [%d]\n", __func__, dentry->d_name.name, rc);
- goto out;
- }
out:
return rc;
}
@@ -232,7 +211,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig,
ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata,
ecryptfs_opt_encrypted_view, ecryptfs_opt_err };
-static match_table_t tokens = {
+static const match_table_t tokens = {
{ecryptfs_opt_sig, "sig=%s"},
{ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"},
{ecryptfs_opt_cipher, "cipher=%s"},
@@ -262,10 +241,11 @@ static int ecryptfs_init_global_auth_toks(
"session keyring for sig specified in mount "
"option: [%s]\n", global_auth_tok->sig);
global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID;
- rc = 0;
+ goto out;
} else
global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID;
}
+out:
return rc;
}
@@ -314,7 +294,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
char *cipher_name_dst;
char *cipher_name_src;
char *cipher_key_bytes_src;
- int cipher_name_len;
if (!options) {
rc = -EINVAL;
@@ -395,17 +374,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
goto out;
}
if (!cipher_name_set) {
- cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
- if (unlikely(cipher_name_len
- >= ECRYPTFS_MAX_CIPHER_NAME_SIZE)) {
- rc = -EINVAL;
- BUG();
- goto out;
- }
- memcpy(mount_crypt_stat->global_default_cipher_name,
- ECRYPTFS_DEFAULT_CIPHER, cipher_name_len);
- mount_crypt_stat->global_default_cipher_name[cipher_name_len]
- = '\0';
+ int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
+
+ BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+
+ strcpy(mount_crypt_stat->global_default_cipher_name,
+ ECRYPTFS_DEFAULT_CIPHER);
}
if (!cipher_key_bytes_set) {
mount_crypt_stat->global_default_cipher_key_size = 0;
@@ -430,7 +404,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
printk(KERN_WARNING "One or more global auth toks could not "
"properly register; rc = [%d]\n", rc);
}
- rc = 0;
out:
return rc;
}
@@ -605,7 +578,7 @@ static struct file_system_type ecryptfs_fs_type = {
* Initializes the ecryptfs_inode_info_cache when it is created
*/
static void
-inode_info_init_once(struct kmem_cache *cachep, void *vptr)
+inode_info_init_once(void *vptr)
{
struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr;
@@ -616,7 +589,7 @@ static struct ecryptfs_cache_info {
struct kmem_cache **cache;
const char *name;
size_t size;
- void (*ctor)(struct kmem_cache *cache, void *obj);
+ void (*ctor)(void *obj);
} ecryptfs_cache_infos[] = {
{
.cache = &ecryptfs_auth_tok_list_item_cache,
@@ -679,6 +652,11 @@ static struct ecryptfs_cache_info {
.name = "ecryptfs_key_tfm_cache",
.size = sizeof(struct ecryptfs_key_tfm),
},
+ {
+ .cache = &ecryptfs_open_req_cache,
+ .name = "ecryptfs_open_req_cache",
+ .size = sizeof(struct ecryptfs_open_req),
+ },
};
static void ecryptfs_free_kmem_caches(void)
@@ -795,11 +773,17 @@ static int __init ecryptfs_init(void)
printk(KERN_ERR "sysfs registration failed\n");
goto out_unregister_filesystem;
}
+ rc = ecryptfs_init_kthread();
+ if (rc) {
+ printk(KERN_ERR "%s: kthread initialization failed; "
+ "rc = [%d]\n", __func__, rc);
+ goto out_do_sysfs_unregistration;
+ }
rc = ecryptfs_init_messaging(ecryptfs_transport);
if (rc) {
- ecryptfs_printk(KERN_ERR, "Failure occured while attempting to "
+ printk(KERN_ERR "Failure occured while attempting to "
"initialize the eCryptfs netlink socket\n");
- goto out_do_sysfs_unregistration;
+ goto out_destroy_kthread;
}
rc = ecryptfs_init_crypto();
if (rc) {
@@ -814,6 +798,8 @@ static int __init ecryptfs_init(void)
goto out;
out_release_messaging:
ecryptfs_release_messaging(ecryptfs_transport);
+out_destroy_kthread:
+ ecryptfs_destroy_kthread();
out_do_sysfs_unregistration:
do_sysfs_unregistration();
out_unregister_filesystem:
@@ -833,6 +819,7 @@ static void __exit ecryptfs_exit(void)
printk(KERN_ERR "Failure whilst attempting to destroy crypto; "
"rc = [%d]\n", rc);
ecryptfs_release_messaging(ecryptfs_transport);
+ ecryptfs_destroy_kthread();
do_sysfs_unregistration();
unregister_filesystem(&ecryptfs_fs_type);
ecryptfs_free_kmem_caches();
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 50c994a249a..b484792a099 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -358,46 +358,6 @@ out_unlock_daemon:
}
/**
- * ecryptfs_miscdev_helo
- * @euid: effective user id of miscdevess sending helo packet
- * @user_ns: The namespace in which @euid applies
- * @pid: miscdevess id of miscdevess sending helo packet
- *
- * Returns zero on success; non-zero otherwise
- */
-static int ecryptfs_miscdev_helo(uid_t euid, struct user_namespace *user_ns,
- struct pid *pid)
-{
- int rc;
-
- rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_MISCDEV, euid, user_ns,
- pid);
- if (rc)
- printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc);
- return rc;
-}
-
-/**
- * ecryptfs_miscdev_quit
- * @euid: effective user id of miscdevess sending quit packet
- * @user_ns: The namespace in which @euid applies
- * @pid: miscdevess id of miscdevess sending quit packet
- *
- * Returns zero on success; non-zero otherwise
- */
-static int ecryptfs_miscdev_quit(uid_t euid, struct user_namespace *user_ns,
- struct pid *pid)
-{
- int rc;
-
- rc = ecryptfs_process_quit(euid, user_ns, pid);
- if (rc)
- printk(KERN_WARNING
- "Error processing QUIT message; rc = [%d]\n", rc);
- return rc;
-}
-
-/**
* ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon
* @data: Bytes comprising struct ecryptfs_message
* @data_size: sizeof(struct ecryptfs_message) + data len
@@ -512,26 +472,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
__func__, rc);
break;
case ECRYPTFS_MSG_HELO:
- rc = ecryptfs_miscdev_helo(current->euid,
- current->nsproxy->user_ns,
- task_pid(current));
- if (rc) {
- printk(KERN_ERR "%s: Error attempting to process "
- "helo from pid [0x%p]; rc = [%d]\n", __func__,
- task_pid(current), rc);
- goto out_free;
- }
- break;
case ECRYPTFS_MSG_QUIT:
- rc = ecryptfs_miscdev_quit(current->euid,
- current->nsproxy->user_ns,
- task_pid(current));
- if (rc) {
- printk(KERN_ERR "%s: Error attempting to process "
- "quit from pid [0x%p]; rc = [%d]\n", __func__,
- task_pid(current), rc);
- goto out_free;
- }
break;
default:
ecryptfs_printk(KERN_WARNING, "Dropping miscdev "
@@ -575,13 +516,11 @@ int ecryptfs_init_ecryptfs_miscdev(void)
int rc;
atomic_set(&ecryptfs_num_miscdev_opens, 0);
- mutex_lock(&ecryptfs_daemon_hash_mux);
rc = misc_register(&ecryptfs_miscdev);
if (rc)
printk(KERN_ERR "%s: Failed to register miscellaneous device "
"for communications with userspace daemons; rc = [%d]\n",
__func__, rc);
- mutex_unlock(&ecryptfs_daemon_hash_mux);
return rc;
}
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 2b6fe1e6e8b..245c2dc02d5 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -32,6 +32,7 @@
#include <linux/file.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
+#include <asm/unaligned.h>
#include "ecryptfs_kernel.h"
/**
@@ -372,7 +373,6 @@ out:
*/
static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode)
{
- u64 file_size;
char *file_size_virt;
int rc;
@@ -381,9 +381,7 @@ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode)
rc = -ENOMEM;
goto out;
}
- file_size = (u64)i_size_read(ecryptfs_inode);
- file_size = cpu_to_be64(file_size);
- memcpy(file_size_virt, &file_size, sizeof(u64));
+ put_unaligned_be64(i_size_read(ecryptfs_inode), file_size_virt);
rc = ecryptfs_write_lower(ecryptfs_inode, file_size_virt, 0,
sizeof(u64));
kfree(file_size_virt);
@@ -403,7 +401,6 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
struct dentry *lower_dentry =
ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_dentry;
struct inode *lower_inode = lower_dentry->d_inode;
- u64 file_size;
int rc;
if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) {
@@ -424,9 +421,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
xattr_virt, PAGE_CACHE_SIZE);
if (size < 0)
size = 8;
- file_size = (u64)i_size_read(ecryptfs_inode);
- file_size = cpu_to_be64(file_size);
- memcpy(xattr_virt, &file_size, sizeof(u64));
+ put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
xattr_virt, size, 0);
mutex_unlock(&lower_inode->i_mutex);
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index 3a404e7fad5..291abb11e20 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -74,8 +74,7 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei
}
unlock_kernel();
- d_add(dentry, inode);
- return NULL;
+ return d_splice_alias(inode, dentry);
}
static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino,
diff --git a/fs/efs/super.c b/fs/efs/super.c
index d733531b55e..73b19cfc91f 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -70,7 +70,7 @@ static void efs_destroy_inode(struct inode *inode)
kmem_cache_free(efs_inode_cachep, INODE_INFO(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct efs_inode_info *ei = (struct efs_inode_info *) foo;
@@ -341,8 +341,6 @@ static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
sb->inode_blocks *
(EFS_BLOCKSIZE / sizeof(struct efs_dinode));
buf->f_ffree = sb->inode_free; /* free inodes */
- buf->f_fsid.val[0] = (sb->fs_magic >> 16) & 0xffff; /* fs ID */
- buf->f_fsid.val[1] = sb->fs_magic & 0xffff; /* fs ID */
buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */
return 0;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 343942deeec..08bf558d040 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -198,11 +198,18 @@ struct file *eventfd_fget(int fd)
return file;
}
-asmlinkage long sys_eventfd(unsigned int count)
+asmlinkage long sys_eventfd2(unsigned int count, int flags)
{
int fd;
struct eventfd_ctx *ctx;
+ /* Check the EFD_* constants for consistency. */
+ BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
+ BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
+
+ if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK))
+ return -EINVAL;
+
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
@@ -214,9 +221,15 @@ asmlinkage long sys_eventfd(unsigned int count)
* When we call this, the initialization must be complete, since
* anon_inode_getfd() will install the fd.
*/
- fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx);
+ fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
+ flags & (O_CLOEXEC | O_NONBLOCK));
if (fd < 0)
kfree(ctx);
return fd;
}
+asmlinkage long sys_eventfd(unsigned int count)
+{
+ return sys_eventfd2(count, 0);
+}
+
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 990c01d2d66..7cc0eb756b5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1041,25 +1041,27 @@ retry:
}
/*
- * It opens an eventpoll file descriptor. The "size" parameter is there
- * for historical reasons, when epoll was using an hash instead of an
- * RB tree. With the current implementation, the "size" parameter is ignored
- * (besides sanity checks).
+ * Open an eventpoll file descriptor.
*/
-asmlinkage long sys_epoll_create(int size)
+asmlinkage long sys_epoll_create1(int flags)
{
int error, fd = -1;
struct eventpoll *ep;
+ /* Check the EPOLL_* constant for consistency. */
+ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
+
+ if (flags & ~EPOLL_CLOEXEC)
+ return -EINVAL;
+
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
- current, size));
+ current, flags));
/*
- * Sanity check on the size parameter, and create the internal data
- * structure ( "struct eventpoll" ).
+ * Create the internal data structure ( "struct eventpoll" ).
*/
- error = -EINVAL;
- if (size <= 0 || (error = ep_alloc(&ep)) < 0) {
+ error = ep_alloc(&ep);
+ if (error < 0) {
fd = error;
goto error_return;
}
@@ -1068,17 +1070,26 @@ asmlinkage long sys_epoll_create(int size)
* Creates all the items needed to setup an eventpoll file. That is,
* a file structure and a free file descriptor.
*/
- fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep);
+ fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
+ flags & O_CLOEXEC);
if (fd < 0)
ep_free(ep);
error_return:
DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
- current, size, fd));
+ current, flags, fd));
return fd;
}
+asmlinkage long sys_epoll_create(int size)
+{
+ if (size < 0)
+ return -EINVAL;
+
+ return sys_epoll_create1(0);
+}
+
/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
diff --git a/fs/exec.c b/fs/exec.c
index da94a6f05df..cecee501ce7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -25,10 +25,11 @@
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
-#include <linux/mman.h>
+#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/smp_lock.h>
+#include <linux/swap.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/pagemap.h>
@@ -37,20 +38,18 @@
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
-#include <linux/swap.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/proc_fs.h>
-#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
-#include <linux/rmap.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
+#include <linux/tracehook.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -108,11 +107,17 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
*/
asmlinkage long sys_uselib(const char __user * library)
{
- struct file * file;
+ struct file *file;
struct nameidata nd;
- int error;
-
- error = __user_path_lookup_open(library, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
+ char *tmp = getname(library);
+ int error = PTR_ERR(tmp);
+
+ if (!IS_ERR(tmp)) {
+ error = path_lookup_open(AT_FDCWD, tmp,
+ LOOKUP_FOLLOW, &nd,
+ FMODE_READ|FMODE_EXEC);
+ putname(tmp);
+ }
if (error)
goto out;
@@ -120,7 +125,11 @@ asmlinkage long sys_uselib(const char __user * library)
if (!S_ISREG(nd.path.dentry->d_inode->i_mode))
goto exit;
- error = vfs_permission(&nd, MAY_READ | MAY_EXEC);
+ error = -EACCES;
+ if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
+ goto exit;
+
+ error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN);
if (error)
goto exit;
@@ -541,7 +550,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
/*
* when the old and new regions overlap clear from new_end.
*/
- free_pgd_range(&tlb, new_end, old_end, new_end,
+ free_pgd_range(tlb, new_end, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : 0);
} else {
/*
@@ -550,7 +559,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
* have constraints on va-space that make this illegal (IA64) -
* for the others its just a little faster.
*/
- free_pgd_range(&tlb, old_start, old_end, new_end,
+ free_pgd_range(tlb, old_start, old_end, new_end,
vma->vm_next ? vma->vm_next->vm_start : 0);
}
tlb_finish_mmu(tlb, new_end, old_end);
@@ -610,7 +619,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
bprm->exec -= stack_shift;
down_write(&mm->mmap_sem);
- vm_flags = vma->vm_flags;
+ vm_flags = VM_STACK_FLAGS;
/*
* Adjust stack execute permissions; explicitly enable for
@@ -658,38 +667,43 @@ EXPORT_SYMBOL(setup_arg_pages);
struct file *open_exec(const char *name)
{
struct nameidata nd;
- int err;
struct file *file;
+ int err;
- err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd, FMODE_READ|FMODE_EXEC);
- file = ERR_PTR(err);
-
- if (!err) {
- struct inode *inode = nd.path.dentry->d_inode;
- file = ERR_PTR(-EACCES);
- if (S_ISREG(inode->i_mode)) {
- int err = vfs_permission(&nd, MAY_EXEC);
- file = ERR_PTR(err);
- if (!err) {
- file = nameidata_to_filp(&nd,
- O_RDONLY|O_LARGEFILE);
- if (!IS_ERR(file)) {
- err = deny_write_access(file);
- if (err) {
- fput(file);
- file = ERR_PTR(err);
- }
- }
-out:
- return file;
- }
- }
- release_open_intent(&nd);
- path_put(&nd.path);
+ err = path_lookup_open(AT_FDCWD, name, LOOKUP_FOLLOW, &nd,
+ FMODE_READ|FMODE_EXEC);
+ if (err)
+ goto out;
+
+ err = -EACCES;
+ if (!S_ISREG(nd.path.dentry->d_inode->i_mode))
+ goto out_path_put;
+
+ if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
+ goto out_path_put;
+
+ err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN);
+ if (err)
+ goto out_path_put;
+
+ file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
+ if (IS_ERR(file))
+ return file;
+
+ err = deny_write_access(file);
+ if (err) {
+ fput(file);
+ goto out;
}
- goto out;
-}
+ return file;
+
+ out_path_put:
+ release_open_intent(&nd);
+ path_put(&nd.path);
+ out:
+ return ERR_PTR(err);
+}
EXPORT_SYMBOL(open_exec);
int kernel_read(struct file *file, unsigned long offset,
@@ -724,12 +738,10 @@ static int exec_mmap(struct mm_struct *mm)
* Make sure that if there is a core dump in progress
* for the old mm, we get out and die instead of going
* through with the exec. We must hold mmap_sem around
- * checking core_waiters and changing tsk->mm. The
- * core-inducing thread will increment core_waiters for
- * each thread whose ->mm == old_mm.
+ * checking core_state and changing tsk->mm.
*/
down_read(&old_mm->mmap_sem);
- if (unlikely(old_mm->core_waiters)) {
+ if (unlikely(old_mm->core_state)) {
up_read(&old_mm->mmap_sem);
return -EINTR;
}
@@ -740,11 +752,11 @@ static int exec_mmap(struct mm_struct *mm)
tsk->active_mm = mm;
activate_mm(active_mm, mm);
task_unlock(tsk);
- mm_update_next_owner(old_mm);
arch_pick_mmap_layout(mm);
if (old_mm) {
up_read(&old_mm->mmap_sem);
BUG_ON(active_mm != old_mm);
+ mm_update_next_owner(old_mm);
mmput(old_mm);
return 0;
}
@@ -1075,13 +1087,8 @@ EXPORT_SYMBOL(prepare_binprm);
static int unsafe_exec(struct task_struct *p)
{
- int unsafe = 0;
- if (p->ptrace & PT_PTRACED) {
- if (p->ptrace & PT_PTRACE_CAP)
- unsafe |= LSM_UNSAFE_PTRACE_CAP;
- else
- unsafe |= LSM_UNSAFE_PTRACE;
- }
+ int unsafe = tracehook_unsafe_exec(p);
+
if (atomic_read(&p->fs->count) > 1 ||
atomic_read(&p->files->count) > 1 ||
atomic_read(&p->sighand->count) > 1)
@@ -1218,6 +1225,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
read_unlock(&binfmt_lock);
retval = fn(bprm, regs);
if (retval >= 0) {
+ tracehook_report_exec(fmt, bprm, regs);
put_binfmt(fmt);
allow_write_access(bprm->file);
if (bprm->file)
@@ -1328,6 +1336,7 @@ int do_execve(char * filename,
if (retval < 0)
goto out;
+ current->flags &= ~PF_KTHREAD;
retval = search_binary_handler(bprm,regs);
if (retval >= 0) {
/* execve success */
@@ -1382,17 +1391,14 @@ EXPORT_SYMBOL(set_binfmt);
* name into corename, which must have space for at least
* CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
*/
-static int format_corename(char *corename, const char *pattern, long signr)
+static int format_corename(char *corename, int nr_threads, long signr)
{
- const char *pat_ptr = pattern;
+ const char *pat_ptr = core_pattern;
+ int ispipe = (*pat_ptr == '|');
char *out_ptr = corename;
char *const out_end = corename + CORENAME_MAX_SIZE;
int rc;
int pid_in_pattern = 0;
- int ispipe = 0;
-
- if (*pattern == '|')
- ispipe = 1;
/* Repeat as long as we have more pattern to process and more output
space */
@@ -1493,7 +1499,7 @@ static int format_corename(char *corename, const char *pattern, long signr)
* and core_uses_pid is set, then .%pid will be appended to
* the filename. Do not do this for piped commands. */
if (!ispipe && !pid_in_pattern
- && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) {
+ && (core_uses_pid || nr_threads)) {
rc = snprintf(out_ptr, out_end - out_ptr,
".%d", task_tgid_vnr(current));
if (rc > out_end - out_ptr)
@@ -1505,9 +1511,10 @@ out:
return ispipe;
}
-static void zap_process(struct task_struct *start)
+static int zap_process(struct task_struct *start)
{
struct task_struct *t;
+ int nr = 0;
start->signal->flags = SIGNAL_GROUP_EXIT;
start->signal->group_stop_count = 0;
@@ -1515,72 +1522,99 @@ static void zap_process(struct task_struct *start)
t = start;
do {
if (t != current && t->mm) {
- t->mm->core_waiters++;
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
+ nr++;
}
- } while ((t = next_thread(t)) != start);
+ } while_each_thread(start, t);
+
+ return nr;
}
static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
- int exit_code)
+ struct core_state *core_state, int exit_code)
{
struct task_struct *g, *p;
unsigned long flags;
- int err = -EAGAIN;
+ int nr = -EAGAIN;
spin_lock_irq(&tsk->sighand->siglock);
if (!signal_group_exit(tsk->signal)) {
+ mm->core_state = core_state;
tsk->signal->group_exit_code = exit_code;
- zap_process(tsk);
- err = 0;
+ nr = zap_process(tsk);
}
spin_unlock_irq(&tsk->sighand->siglock);
- if (err)
- return err;
+ if (unlikely(nr < 0))
+ return nr;
- if (atomic_read(&mm->mm_users) == mm->core_waiters + 1)
+ if (atomic_read(&mm->mm_users) == nr + 1)
goto done;
-
+ /*
+ * We should find and kill all tasks which use this mm, and we should
+ * count them correctly into ->nr_threads. We don't take tasklist
+ * lock, but this is safe wrt:
+ *
+ * fork:
+ * None of sub-threads can fork after zap_process(leader). All
+ * processes which were created before this point should be
+ * visible to zap_threads() because copy_process() adds the new
+ * process to the tail of init_task.tasks list, and lock/unlock
+ * of ->siglock provides a memory barrier.
+ *
+ * do_exit:
+ * The caller holds mm->mmap_sem. This means that the task which
+ * uses this mm can't pass exit_mm(), so it can't exit or clear
+ * its ->mm.
+ *
+ * de_thread:
+ * It does list_replace_rcu(&leader->tasks, &current->tasks),
+ * we must see either old or new leader, this does not matter.
+ * However, it can change p->sighand, so lock_task_sighand(p)
+ * must be used. Since p->mm != NULL and we hold ->mmap_sem
+ * it can't fail.
+ *
+ * Note also that "g" can be the old leader with ->mm == NULL
+ * and already unhashed and thus removed from ->thread_group.
+ * This is OK, __unhash_process()->list_del_rcu() does not
+ * clear the ->next pointer, we will find the new leader via
+ * next_thread().
+ */
rcu_read_lock();
for_each_process(g) {
if (g == tsk->group_leader)
continue;
-
+ if (g->flags & PF_KTHREAD)
+ continue;
p = g;
do {
if (p->mm) {
- if (p->mm == mm) {
- /*
- * p->sighand can't disappear, but
- * may be changed by de_thread()
- */
+ if (unlikely(p->mm == mm)) {
lock_task_sighand(p, &flags);
- zap_process(p);
+ nr += zap_process(p);
unlock_task_sighand(p, &flags);
}
break;
}
- } while ((p = next_thread(p)) != g);
+ } while_each_thread(g, p);
}
rcu_read_unlock();
done:
- return mm->core_waiters;
+ atomic_set(&core_state->nr_threads, nr);
+ return nr;
}
-static int coredump_wait(int exit_code)
+static int coredump_wait(int exit_code, struct core_state *core_state)
{
struct task_struct *tsk = current;
struct mm_struct *mm = tsk->mm;
- struct completion startup_done;
struct completion *vfork_done;
int core_waiters;
- init_completion(&mm->core_done);
- init_completion(&startup_done);
- mm->core_startup_done = &startup_done;
-
- core_waiters = zap_threads(tsk, mm, exit_code);
+ init_completion(&core_state->startup);
+ core_state->dumper.task = tsk;
+ core_state->dumper.next = NULL;
+ core_waiters = zap_threads(tsk, mm, core_state, exit_code);
up_write(&mm->mmap_sem);
if (unlikely(core_waiters < 0))
@@ -1597,12 +1631,32 @@ static int coredump_wait(int exit_code)
}
if (core_waiters)
- wait_for_completion(&startup_done);
+ wait_for_completion(&core_state->startup);
fail:
- BUG_ON(mm->core_waiters);
return core_waiters;
}
+static void coredump_finish(struct mm_struct *mm)
+{
+ struct core_thread *curr, *next;
+ struct task_struct *task;
+
+ next = mm->core_state->dumper.next;
+ while ((curr = next) != NULL) {
+ next = curr->next;
+ task = curr->task;
+ /*
+ * see exit_mm(), curr->task must not see
+ * ->task == NULL before we read ->next.
+ */
+ smp_mb();
+ curr->task = NULL;
+ wake_up_process(task);
+ }
+
+ mm->core_state = NULL;
+}
+
/*
* set_dumpable converts traditional three-value dumpable to two flags and
* stores them into mm->flags. It modifies lower two bits of mm->flags, but
@@ -1654,6 +1708,7 @@ int get_dumpable(struct mm_struct *mm)
int do_coredump(long signr, int exit_code, struct pt_regs * regs)
{
+ struct core_state core_state;
char corename[CORENAME_MAX_SIZE + 1];
struct mm_struct *mm = current->mm;
struct linux_binfmt * binfmt;
@@ -1677,7 +1732,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
/*
* If another thread got here first, or we are not dumpable, bail out.
*/
- if (mm->core_waiters || !get_dumpable(mm)) {
+ if (mm->core_state || !get_dumpable(mm)) {
up_write(&mm->mmap_sem);
goto fail;
}
@@ -1692,7 +1747,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
current->fsuid = 0; /* Dump root private */
}
- retval = coredump_wait(exit_code);
+ retval = coredump_wait(exit_code, &core_state);
if (retval < 0)
goto fail;
@@ -1707,7 +1762,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
* uses lock_kernel()
*/
lock_kernel();
- ispipe = format_corename(corename, core_pattern, signr);
+ ispipe = format_corename(corename, retval, signr);
unlock_kernel();
/*
* Don't bother to check the RLIMIT_CORE value if core_pattern points
@@ -1786,7 +1841,7 @@ fail_unlock:
argv_free(helper_argv);
current->fsuid = fsuid;
- complete_all(&mm->core_done);
+ coredump_finish(mm);
fail:
return retval;
}
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index e58669e1b87..ae8c4f850b2 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -294,7 +294,7 @@ ext2_check_acl(struct inode *inode, int mask)
}
int
-ext2_permission(struct inode *inode, int mask, struct nameidata *nd)
+ext2_permission(struct inode *inode, int mask)
{
return generic_permission(inode, mask, ext2_check_acl);
}
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 0bde85bafe3..b42cf578554 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -58,7 +58,7 @@ static inline int ext2_acl_count(size_t size)
#define EXT2_ACL_NOT_CACHED ((void *)-1)
/* acl.c */
-extern int ext2_permission (struct inode *, int, struct nameidata *);
+extern int ext2_permission (struct inode *, int);
extern int ext2_acl_chmod (struct inode *);
extern int ext2_init_acl (struct inode *, struct inode *);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 47d88da2d33..bae998c1e44 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -133,6 +133,8 @@ extern void ext2_truncate (struct inode *);
extern int ext2_setattr (struct dentry *, struct iattr *);
extern void ext2_set_inode_flags(struct inode *inode);
extern void ext2_get_inode_flags(struct ext2_inode_info *);
+extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
int __ext2_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5f2fa9c3629..45ed0712218 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -86,4 +86,5 @@ const struct inode_operations ext2_file_inode_operations = {
#endif
.setattr = ext2_setattr,
.permission = ext2_permission,
+ .fiemap = ext2_fiemap,
};
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 384fc0d1dd7..7658b33e265 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,6 +31,7 @@
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
+#include <linux/fiemap.h>
#include "ext2.h"
#include "acl.h"
#include "xip.h"
@@ -704,6 +705,13 @@ int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_
}
+int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ return generic_block_fiemap(inode, fieinfo, start, len,
+ ext2_get_block);
+}
+
static int ext2_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, ext2_get_block, wbc);
@@ -791,6 +799,7 @@ const struct address_space_operations ext2_aops = {
.direct_IO = ext2_direct_IO,
.writepages = ext2_writepages,
.migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
};
const struct address_space_operations ext2_aops_xip = {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ef50cbc792d..647cd888ac8 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -31,6 +31,7 @@
#include <linux/seq_file.h>
#include <linux/mount.h>
#include <linux/log2.h>
+#include <linux/quotaops.h>
#include <asm/uaccess.h>
#include "ext2.h"
#include "xattr.h"
@@ -158,7 +159,7 @@ static void ext2_destroy_inode(struct inode *inode)
kmem_cache_free(ext2_inode_cachep, EXT2_I(inode));
}
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
{
struct ext2_inode_info *ei = (struct ext2_inode_info *) foo;
@@ -392,7 +393,7 @@ enum {
Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_bsd_df, "bsddf"},
{Opt_minix_df, "minixdf"},
{Opt_grpid, "grpid"},
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index eaa23d2d521..70c0dbdcdcb 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -14,7 +14,7 @@ static size_t
ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const int prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
+ const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (list && total_len <= list_size) {
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 83ee149f353..e8219f8eae9 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -12,13 +12,11 @@
#include <linux/ext2_fs.h>
#include "xattr.h"
-#define XATTR_TRUSTED_PREFIX "trusted."
-
static size_t
ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const int prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const int prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index f383e7c3a7b..92495d28c62 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -11,13 +11,11 @@
#include "ext2.h"
#include "xattr.h"
-#define XATTR_USER_PREFIX "user."
-
static size_t
ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index a754d184817..b60bb241880 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -299,7 +299,7 @@ ext3_check_acl(struct inode *inode, int mask)
}
int
-ext3_permission(struct inode *inode, int mask, struct nameidata *nd)
+ext3_permission(struct inode *inode, int mask)
{
return generic_permission(inode, mask, ext3_check_acl);
}
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 0d1e6279cbf..42da16b8cac 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -58,7 +58,7 @@ static inline int ext3_acl_count(size_t size)
#define EXT3_ACL_NOT_CACHED ((void *)-1)
/* acl.c */
-extern int ext3_permission (struct inode *, int, struct nameidata *);
+extern int ext3_permission (struct inode *, int);
extern int ext3_acl_chmod (struct inode *);
extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 8ca3bfd7242..2eea96ec78e 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -272,7 +272,7 @@ static void free_rb_tree_fname(struct rb_root *root)
while (n) {
/* Do the node's children first */
- if ((n)->rb_left) {
+ if (n->rb_left) {
n = n->rb_left;
continue;
}
@@ -301,24 +301,18 @@ static void free_rb_tree_fname(struct rb_root *root)
parent->rb_right = NULL;
n = parent;
}
- root->rb_node = NULL;
}
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos)
{
struct dir_private_info *p;
- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->root.rb_node = NULL;
- p->curr_node = NULL;
- p->extra_fname = NULL;
- p->last_pos = 0;
p->curr_hash = pos2maj_hash(pos);
p->curr_minor_hash = pos2min_hash(pos);
- p->next_hash = 0;
return p;
}
@@ -433,7 +427,7 @@ static int ext3_dx_readdir(struct file * filp,
int ret;
if (!info) {
- info = create_dir_info(filp->f_pos);
+ info = ext3_htree_create_dir_info(filp->f_pos);
if (!info)
return -ENOMEM;
filp->private_data = info;
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index acc4913d301..3be1e0689c9 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -134,5 +134,6 @@ const struct inode_operations ext3_file_inode_operations = {
.removexattr = generic_removexattr,
#endif
.permission = ext3_permission,
+ .fiemap = ext3_fiemap,
};
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 77126821b2e..47b678d73e7 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -669,6 +669,14 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino)
if (IS_ERR(inode))
goto iget_failed;
+ /*
+ * If the orphans has i_nlinks > 0 then it should be able to be
+ * truncated, otherwise it won't be removed from the orphan list
+ * during processing and an infinite loop will result.
+ */
+ if (inode->i_nlink && !ext3_can_truncate(inode))
+ goto bad_orphan;
+
if (NEXT_ORPHAN(inode) > max_ino)
goto bad_orphan;
brelse(bitmap_bh);
@@ -690,6 +698,7 @@ bad_orphan:
printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+ printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 6ae4ecf3ce4..ebfec4d0148 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -36,6 +36,7 @@
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/bio.h>
+#include <linux/fiemap.h>
#include "xattr.h"
#include "acl.h"
@@ -981,6 +982,13 @@ out:
return ret;
}
+int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ return generic_block_fiemap(inode, fieinfo, start, len,
+ ext3_get_block);
+}
+
/*
* `handle' can be NULL if create is zero
*/
@@ -1767,44 +1775,47 @@ static int ext3_journalled_set_page_dirty(struct page *page)
}
static const struct address_space_operations ext3_ordered_aops = {
- .readpage = ext3_readpage,
- .readpages = ext3_readpages,
- .writepage = ext3_ordered_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext3_write_begin,
- .write_end = ext3_ordered_write_end,
- .bmap = ext3_bmap,
- .invalidatepage = ext3_invalidatepage,
- .releasepage = ext3_releasepage,
- .direct_IO = ext3_direct_IO,
- .migratepage = buffer_migrate_page,
+ .readpage = ext3_readpage,
+ .readpages = ext3_readpages,
+ .writepage = ext3_ordered_writepage,
+ .sync_page = block_sync_page,
+ .write_begin = ext3_write_begin,
+ .write_end = ext3_ordered_write_end,
+ .bmap = ext3_bmap,
+ .invalidatepage = ext3_invalidatepage,
+ .releasepage = ext3_releasepage,
+ .direct_IO = ext3_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
};
static const struct address_space_operations ext3_writeback_aops = {
- .readpage = ext3_readpage,
- .readpages = ext3_readpages,
- .writepage = ext3_writeback_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext3_write_begin,
- .write_end = ext3_writeback_write_end,
- .bmap = ext3_bmap,
- .invalidatepage = ext3_invalidatepage,
- .releasepage = ext3_releasepage,
- .direct_IO = ext3_direct_IO,
- .migratepage = buffer_migrate_page,
+ .readpage = ext3_readpage,
+ .readpages = ext3_readpages,
+ .writepage = ext3_writeback_writepage,
+ .sync_page = block_sync_page,
+ .write_begin = ext3_write_begin,
+ .write_end = ext3_writeback_write_end,
+ .bmap = ext3_bmap,
+ .invalidatepage = ext3_invalidatepage,
+ .releasepage = ext3_releasepage,
+ .direct_IO = ext3_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
};
static const struct address_space_operations ext3_journalled_aops = {
- .readpage = ext3_readpage,
- .readpages = ext3_readpages,
- .writepage = ext3_journalled_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext3_write_begin,
- .write_end = ext3_journalled_write_end,
- .set_page_dirty = ext3_journalled_set_page_dirty,
- .bmap = ext3_bmap,
- .invalidatepage = ext3_invalidatepage,
- .releasepage = ext3_releasepage,
+ .readpage = ext3_readpage,
+ .readpages = ext3_readpages,
+ .writepage = ext3_journalled_writepage,
+ .sync_page = block_sync_page,
+ .write_begin = ext3_write_begin,
+ .write_end = ext3_journalled_write_end,
+ .set_page_dirty = ext3_journalled_set_page_dirty,
+ .bmap = ext3_bmap,
+ .invalidatepage = ext3_invalidatepage,
+ .releasepage = ext3_releasepage,
+ .is_partially_uptodate = block_is_partially_uptodate,
};
void ext3_set_aops(struct inode *inode)
@@ -2127,7 +2138,21 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
if (this_bh) {
BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
- ext3_journal_dirty_metadata(handle, this_bh);
+
+ /*
+ * The buffer head should have an attached journal head at this
+ * point. However, if the data is corrupted and an indirect
+ * block pointed to itself, it would have been detached when
+ * the block was cleared. Check for this instead of OOPSing.
+ */
+ if (bh2jh(this_bh))
+ ext3_journal_dirty_metadata(handle, this_bh);
+ else
+ ext3_error(inode->i_sb, "ext3_free_data",
+ "circular indirect block detected, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long)this_bh->b_blocknr);
}
}
@@ -2253,6 +2278,19 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode,
}
}
+int ext3_can_truncate(struct inode *inode)
+{
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return 0;
+ if (S_ISREG(inode->i_mode))
+ return 1;
+ if (S_ISDIR(inode->i_mode))
+ return 1;
+ if (S_ISLNK(inode->i_mode))
+ return !ext3_inode_is_fast_symlink(inode);
+ return 0;
+}
+
/*
* ext3_truncate()
*
@@ -2297,12 +2335,7 @@ void ext3_truncate(struct inode *inode)
unsigned blocksize = inode->i_sb->s_blocksize;
struct page *page;
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
- if (ext3_inode_is_fast_symlink(inode))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ if (!ext3_can_truncate(inode))
return;
/*
@@ -2513,6 +2546,16 @@ static int __ext3_get_inode_loc(struct inode *inode,
}
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
+
+ /*
+ * If the buffer has the write error flag, we have failed
+ * to write out another inode in the same block. In this
+ * case, we don't have to read the block because we may
+ * read the old inode data successfully.
+ */
+ if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+
if (buffer_uptodate(bh)) {
/* someone brought it uptodate while we waited */
unlock_buffer(bh);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 0b8cf80154f..de13e919cd8 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -240,13 +240,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
EXT3_DIR_REC_LEN(2) - infosize;
- return 0? 20: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
static inline unsigned dx_node_limit (struct inode *dir)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
- return 0? 22: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
/*
@@ -991,19 +991,21 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
de = (struct ext3_dir_entry_2 *) bh->b_data;
top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
EXT3_DIR_REC_LEN(0));
- for (; de < top; de = ext3_next_entry(de))
- if (ext3_match (namelen, name, de)) {
- if (!ext3_check_dir_entry("ext3_find_entry",
- dir, de, bh,
- (block<<EXT3_BLOCK_SIZE_BITS(sb))
- +((char *)de - bh->b_data))) {
- brelse (bh);
+ for (; de < top; de = ext3_next_entry(de)) {
+ int off = (block << EXT3_BLOCK_SIZE_BITS(sb))
+ + ((char *) de - bh->b_data);
+
+ if (!ext3_check_dir_entry(__func__, dir, de, bh, off)) {
+ brelse(bh);
*err = ERR_BAD_DX_DIR;
goto errout;
}
- *res_dir = de;
- dx_release (frames);
- return bh;
+
+ if (ext3_match(namelen, name, de)) {
+ *res_dir = de;
+ dx_release(frames);
+ return bh;
+ }
}
brelse (bh);
/* Check to see if we should continue to search */
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index fe3119a71ad..399a96a6c55 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -472,7 +472,7 @@ static void ext3_destroy_inode(struct inode *inode)
kmem_cache_free(ext3_inode_cachep, EXT3_I(inode));
}
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
{
struct ext3_inode_info *ei = (struct ext3_inode_info *) foo;
@@ -760,7 +760,7 @@ enum {
Opt_grpquota
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_bsd_df, "bsddf"},
{Opt_minix_df, "minixdf"},
{Opt_grpid, "grpid"},
@@ -842,7 +842,7 @@ static int parse_options (char *options, struct super_block *sb,
int data_opt = 0;
int option;
#ifdef CONFIG_QUOTA
- int qtype;
+ int qtype, qfmt;
char *qname;
#endif
@@ -1018,9 +1018,11 @@ static int parse_options (char *options, struct super_block *sb,
case Opt_grpjquota:
qtype = GRPQUOTA;
set_qf_name:
- if (sb_any_quota_enabled(sb)) {
+ if ((sb_any_quota_enabled(sb) ||
+ sb_any_quota_suspended(sb)) &&
+ !sbi->s_qf_names[qtype]) {
printk(KERN_ERR
- "EXT3-fs: Cannot change journalled "
+ "EXT3-fs: Cannot change journaled "
"quota options when quota turned on.\n");
return 0;
}
@@ -1056,9 +1058,11 @@ set_qf_name:
case Opt_offgrpjquota:
qtype = GRPQUOTA;
clear_qf_name:
- if (sb_any_quota_enabled(sb)) {
+ if ((sb_any_quota_enabled(sb) ||
+ sb_any_quota_suspended(sb)) &&
+ sbi->s_qf_names[qtype]) {
printk(KERN_ERR "EXT3-fs: Cannot change "
- "journalled quota options when "
+ "journaled quota options when "
"quota turned on.\n");
return 0;
}
@@ -1069,10 +1073,20 @@ clear_qf_name:
sbi->s_qf_names[qtype] = NULL;
break;
case Opt_jqfmt_vfsold:
- sbi->s_jquota_fmt = QFMT_VFS_OLD;
- break;
+ qfmt = QFMT_VFS_OLD;
+ goto set_qf_format;
case Opt_jqfmt_vfsv0:
- sbi->s_jquota_fmt = QFMT_VFS_V0;
+ qfmt = QFMT_VFS_V0;
+set_qf_format:
+ if ((sb_any_quota_enabled(sb) ||
+ sb_any_quota_suspended(sb)) &&
+ sbi->s_jquota_fmt != qfmt) {
+ printk(KERN_ERR "EXT3-fs: Cannot change "
+ "journaled quota options when "
+ "quota turned on.\n");
+ return 0;
+ }
+ sbi->s_jquota_fmt = qfmt;
break;
case Opt_quota:
case Opt_usrquota:
@@ -1084,7 +1098,8 @@ clear_qf_name:
set_opt(sbi->s_mount_opt, GRPQUOTA);
break;
case Opt_noquota:
- if (sb_any_quota_enabled(sb)) {
+ if (sb_any_quota_enabled(sb) ||
+ sb_any_quota_suspended(sb)) {
printk(KERN_ERR "EXT3-fs: Cannot change quota "
"options when quota turned on.\n");
return 0;
@@ -1169,14 +1184,14 @@ clear_qf_name:
}
if (!sbi->s_jquota_fmt) {
- printk(KERN_ERR "EXT3-fs: journalled quota format "
+ printk(KERN_ERR "EXT3-fs: journaled quota format "
"not specified.\n");
return 0;
}
} else {
if (sbi->s_jquota_fmt) {
- printk(KERN_ERR "EXT3-fs: journalled quota format "
- "specified with no journalling "
+ printk(KERN_ERR "EXT3-fs: journaled quota format "
+ "specified with no journaling "
"enabled.\n");
return 0;
}
@@ -1370,7 +1385,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
int ret = ext3_quota_on_mount(sb, i);
if (ret < 0)
printk(KERN_ERR
- "EXT3-fs: Cannot turn on journalled "
+ "EXT3-fs: Cannot turn on journaled "
"quota: error %d\n", ret);
}
}
@@ -2712,7 +2727,7 @@ static int ext3_release_dquot(struct dquot *dquot)
static int ext3_mark_dquot_dirty(struct dquot *dquot)
{
- /* Are we journalling quotas? */
+ /* Are we journaling quotas? */
if (EXT3_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
EXT3_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
dquot_mark_dquot_dirty(dquot);
@@ -2759,25 +2774,45 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
if (!test_opt(sb, QUOTA))
return -EINVAL;
- /* Not journalling quota or remount? */
- if ((!EXT3_SB(sb)->s_qf_names[USRQUOTA] &&
- !EXT3_SB(sb)->s_qf_names[GRPQUOTA]) || remount)
+ /* When remounting, no checks are needed and in fact, path is NULL */
+ if (remount)
return vfs_quota_on(sb, type, format_id, path, remount);
+
err = path_lookup(path, LOOKUP_FOLLOW, &nd);
if (err)
return err;
+
/* Quotafile not on the same filesystem? */
if (nd.path.mnt->mnt_sb != sb) {
path_put(&nd.path);
return -EXDEV;
}
- /* Quotafile not in fs root? */
- if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
- printk(KERN_WARNING
- "EXT3-fs: Quota file not on filesystem root. "
- "Journalled quota will not work.\n");
+ /* Journaling quota? */
+ if (EXT3_SB(sb)->s_qf_names[type]) {
+ /* Quotafile not of fs root? */
+ if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
+ printk(KERN_WARNING
+ "EXT3-fs: Quota file not on filesystem root. "
+ "Journaled quota will not work.\n");
+ }
+
+ /*
+ * When we journal data on quota file, we have to flush journal to see
+ * all updates to the file when we bypass pagecache...
+ */
+ if (ext3_should_journal_data(nd.path.dentry->d_inode)) {
+ /*
+ * We don't need to lock updates but journal_flush() could
+ * otherwise be livelocked...
+ */
+ journal_lock_updates(EXT3_SB(sb)->s_journal);
+ journal_flush(EXT3_SB(sb)->s_journal);
+ journal_unlock_updates(EXT3_SB(sb)->s_journal);
+ }
+
+ err = vfs_quota_on_path(sb, type, format_id, &nd.path);
path_put(&nd.path);
- return vfs_quota_on(sb, type, format_id, path, remount);
+ return err;
}
/* Read data from quotafile - avoid pagecache and such because we cannot afford
@@ -2875,8 +2910,10 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
blk++;
}
out:
- if (len == towrite)
+ if (len == towrite) {
+ mutex_unlock(&inode->i_mutex);
return err;
+ }
if (inode->i_size < off+len-towrite) {
i_size_write(inode, off+len-towrite);
EXT3_I(inode)->i_disksize = inode->i_size;
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 821efaf2b94..37b81097bdf 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -15,7 +15,7 @@ static size_t
ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
+ const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index 0327497a55c..c7c41a410c4 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -13,13 +13,11 @@
#include <linux/ext3_fs.h>
#include "xattr.h"
-#define XATTR_TRUSTED_PREFIX "trusted."
-
static size_t
ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 1abd8f92c44..430fe63b31b 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -12,13 +12,11 @@
#include <linux/ext3_fs.h>
#include "xattr.h"
-#define XATTR_USER_PREFIX "user."
-
static size_t
ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index ac6fa8ca0a2..a8ff003a00f 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -2,12 +2,12 @@
# Makefile for the linux ext4-filesystem routines.
#
-obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
+obj-$(CONFIG_EXT4_FS) += ext4.o
-ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o mballoc.o
-ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
-ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
-ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o
+ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
+ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
+ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 3c8dab880d9..694ed6fadcc 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -40,34 +40,35 @@ ext4_acl_from_disk(const void *value, size_t size)
acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl)
return ERR_PTR(-ENOMEM);
- for (n=0; n < count; n++) {
+ for (n = 0; n < count; n++) {
ext4_acl_entry *entry =
(ext4_acl_entry *)value;
if ((char *)value + sizeof(ext4_acl_entry_short) > end)
goto fail;
acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
- switch(acl->a_entries[n].e_tag) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- value = (char *)value +
- sizeof(ext4_acl_entry_short);
- acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
- break;
-
- case ACL_USER:
- case ACL_GROUP:
- value = (char *)value + sizeof(ext4_acl_entry);
- if ((char *)value > end)
- goto fail;
- acl->a_entries[n].e_id =
- le32_to_cpu(entry->e_id);
- break;
-
- default:
+
+ switch (acl->a_entries[n].e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ value = (char *)value +
+ sizeof(ext4_acl_entry_short);
+ acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
+ break;
+
+ case ACL_USER:
+ case ACL_GROUP:
+ value = (char *)value + sizeof(ext4_acl_entry);
+ if ((char *)value > end)
goto fail;
+ acl->a_entries[n].e_id =
+ le32_to_cpu(entry->e_id);
+ break;
+
+ default:
+ goto fail;
}
}
if (value != end)
@@ -96,27 +97,26 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
return ERR_PTR(-ENOMEM);
ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
e = (char *)ext_acl + sizeof(ext4_acl_header);
- for (n=0; n < acl->a_count; n++) {
+ for (n = 0; n < acl->a_count; n++) {
ext4_acl_entry *entry = (ext4_acl_entry *)e;
entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
- switch(acl->a_entries[n].e_tag) {
- case ACL_USER:
- case ACL_GROUP:
- entry->e_id =
- cpu_to_le32(acl->a_entries[n].e_id);
- e += sizeof(ext4_acl_entry);
- break;
-
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- e += sizeof(ext4_acl_entry_short);
- break;
-
- default:
- goto fail;
+ switch (acl->a_entries[n].e_tag) {
+ case ACL_USER:
+ case ACL_GROUP:
+ entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
+ e += sizeof(ext4_acl_entry);
+ break;
+
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ e += sizeof(ext4_acl_entry_short);
+ break;
+
+ default:
+ goto fail;
}
}
return (char *)ext_acl;
@@ -167,23 +167,23 @@ ext4_get_acl(struct inode *inode, int type)
if (!test_opt(inode->i_sb, POSIX_ACL))
return NULL;
- switch(type) {
- case ACL_TYPE_ACCESS:
- acl = ext4_iget_acl(inode, &ei->i_acl);
- if (acl != EXT4_ACL_NOT_CACHED)
- return acl;
- name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
- break;
-
- case ACL_TYPE_DEFAULT:
- acl = ext4_iget_acl(inode, &ei->i_default_acl);
- if (acl != EXT4_ACL_NOT_CACHED)
- return acl;
- name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
- break;
-
- default:
- return ERR_PTR(-EINVAL);
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ acl = ext4_iget_acl(inode, &ei->i_acl);
+ if (acl != EXT4_ACL_NOT_CACHED)
+ return acl;
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ acl = ext4_iget_acl(inode, &ei->i_default_acl);
+ if (acl != EXT4_ACL_NOT_CACHED)
+ return acl;
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ break;
+
+ default:
+ return ERR_PTR(-EINVAL);
}
retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
if (retval > 0) {
@@ -201,14 +201,14 @@ ext4_get_acl(struct inode *inode, int type)
kfree(value);
if (!IS_ERR(acl)) {
- switch(type) {
- case ACL_TYPE_ACCESS:
- ext4_iset_acl(inode, &ei->i_acl, acl);
- break;
-
- case ACL_TYPE_DEFAULT:
- ext4_iset_acl(inode, &ei->i_default_acl, acl);
- break;
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ ext4_iset_acl(inode, &ei->i_acl, acl);
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ ext4_iset_acl(inode, &ei->i_default_acl, acl);
+ break;
}
}
return acl;
@@ -232,31 +232,31 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
- switch(type) {
- case ACL_TYPE_ACCESS:
- name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
- if (acl) {
- mode_t mode = inode->i_mode;
- error = posix_acl_equiv_mode(acl, &mode);
- if (error < 0)
- return error;
- else {
- inode->i_mode = mode;
- ext4_mark_inode_dirty(handle, inode);
- if (error == 0)
- acl = NULL;
- }
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
+ if (acl) {
+ mode_t mode = inode->i_mode;
+ error = posix_acl_equiv_mode(acl, &mode);
+ if (error < 0)
+ return error;
+ else {
+ inode->i_mode = mode;
+ ext4_mark_inode_dirty(handle, inode);
+ if (error == 0)
+ acl = NULL;
}
- break;
+ }
+ break;
- case ACL_TYPE_DEFAULT:
- name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
- if (!S_ISDIR(inode->i_mode))
- return acl ? -EACCES : 0;
- break;
+ case ACL_TYPE_DEFAULT:
+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ break;
- default:
- return -EINVAL;
+ default:
+ return -EINVAL;
}
if (acl) {
value = ext4_acl_to_disk(acl, &size);
@@ -269,14 +269,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
kfree(value);
if (!error) {
- switch(type) {
- case ACL_TYPE_ACCESS:
- ext4_iset_acl(inode, &ei->i_acl, acl);
- break;
-
- case ACL_TYPE_DEFAULT:
- ext4_iset_acl(inode, &ei->i_default_acl, acl);
- break;
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ ext4_iset_acl(inode, &ei->i_acl, acl);
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ ext4_iset_acl(inode, &ei->i_default_acl, acl);
+ break;
}
}
return error;
@@ -299,7 +299,7 @@ ext4_check_acl(struct inode *inode, int mask)
}
int
-ext4_permission(struct inode *inode, int mask, struct nameidata *nd)
+ext4_permission(struct inode *inode, int mask)
{
return generic_permission(inode, mask, ext4_check_acl);
}
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 26a5c1abf14..cb45257a246 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -51,18 +51,18 @@ static inline int ext4_acl_count(size_t size)
}
}
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
/* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl
if the ACL has not been cached */
#define EXT4_ACL_NOT_CACHED ((void *)-1)
/* acl.c */
-extern int ext4_permission (struct inode *, int, struct nameidata *);
-extern int ext4_acl_chmod (struct inode *);
-extern int ext4_init_acl (handle_t *, struct inode *, struct inode *);
+extern int ext4_permission(struct inode *, int);
+extern int ext4_acl_chmod(struct inode *);
+extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
-#else /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#else /* CONFIG_EXT4_FS_POSIX_ACL */
#include <linux/sched.h>
#define ext4_permission NULL
@@ -77,5 +77,5 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
return 0;
}
-#endif /* CONFIG_EXT4DEV_FS_POSIX_ACL */
+#endif /* CONFIG_EXT4_FS_POSIX_ACL */
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9cc80b9cc8d..bd2ece22882 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
ext4_group_t block_group)
{
ext4_group_t actual_group;
- ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
+ ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
if (actual_group == block_group)
return 1;
return 0;
@@ -83,6 +83,7 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
}
return used_blocks;
}
+
/* Initializes an uninitialized block bitmap if given, and returns the
* number of blocks free in the group. */
unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
@@ -121,12 +122,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
}
} else { /* For META_BG_BLOCK_GROUPS */
- int group_rel = (block_group -
- le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
- EXT4_DESC_PER_BLOCK(sb);
- if (group_rel == 0 || group_rel == 1 ||
- (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
- bit_max += 1;
+ bit_max += ext4_bg_num_gdb(sb, block_group);
}
if (block_group == sbi->s_groups_count - 1) {
@@ -137,7 +133,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
*/
group_blocks = ext4_blocks_count(sbi->s_es) -
le32_to_cpu(sbi->s_es->s_first_data_block) -
- (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1));
+ (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
} else {
group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
}
@@ -205,20 +201,20 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
* @bh: pointer to the buffer head to store the block
* group descriptor
*/
-struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
+struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
ext4_group_t block_group,
- struct buffer_head ** bh)
+ struct buffer_head **bh)
{
unsigned long group_desc;
unsigned long offset;
- struct ext4_group_desc * desc;
+ struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (block_group >= sbi->s_groups_count) {
- ext4_error (sb, "ext4_get_group_desc",
- "block_group >= groups_count - "
- "block_group = %lu, groups_count = %lu",
- block_group, sbi->s_groups_count);
+ ext4_error(sb, "ext4_get_group_desc",
+ "block_group >= groups_count - "
+ "block_group = %lu, groups_count = %lu",
+ block_group, sbi->s_groups_count);
return NULL;
}
@@ -227,10 +223,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
if (!sbi->s_group_desc[group_desc]) {
- ext4_error (sb, "ext4_get_group_desc",
- "Group descriptor not loaded - "
- "block_group = %lu, group_desc = %lu, desc = %lu",
- block_group, group_desc, offset);
+ ext4_error(sb, "ext4_get_group_desc",
+ "Group descriptor not loaded - "
+ "block_group = %lu, group_desc = %lu, desc = %lu",
+ block_group, group_desc, offset);
return NULL;
}
@@ -295,7 +291,7 @@ err_out:
return 0;
}
/**
- * read_block_bitmap()
+ * ext4_read_block_bitmap()
* @sb: super block
* @block_group: given block group
*
@@ -305,10 +301,10 @@ err_out:
* Return buffer_head on success or NULL in case of failure.
*/
struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
- struct ext4_group_desc * desc;
- struct buffer_head * bh = NULL;
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh = NULL;
ext4_fsblk_t bitmap_blk;
desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -319,25 +315,30 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
if (unlikely(!bh)) {
ext4_error(sb, __func__,
"Cannot read block bitmap - "
- "block_group = %d, block_bitmap = %llu",
- (int)block_group, (unsigned long long)bitmap_blk);
+ "block_group = %lu, block_bitmap = %llu",
+ block_group, bitmap_blk);
return NULL;
}
- if (bh_uptodate_or_lock(bh))
+ if (buffer_uptodate(bh) &&
+ !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
return bh;
+ lock_buffer(bh);
+ spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh, block_group, desc);
set_buffer_uptodate(bh);
unlock_buffer(bh);
+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
return bh;
}
+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (bh_submit_read(bh) < 0) {
put_bh(bh);
ext4_error(sb, __func__,
"Cannot read block bitmap - "
- "block_group = %d, block_bitmap = %llu",
- (int)block_group, (unsigned long long)bitmap_blk);
+ "block_group = %lu, block_bitmap = %llu",
+ block_group, bitmap_blk);
return NULL;
}
ext4_valid_block_bitmap(sb, desc, block_group, bh);
@@ -347,302 +348,6 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
*/
return bh;
}
-/*
- * The reservation window structure operations
- * --------------------------------------------
- * Operations include:
- * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
- *
- * We use a red-black tree to represent per-filesystem reservation
- * windows.
- *
- */
-
-/**
- * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
- * @rb_root: root of per-filesystem reservation rb tree
- * @verbose: verbose mode
- * @fn: function which wishes to dump the reservation map
- *
- * If verbose is turned on, it will print the whole block reservation
- * windows(start, end). Otherwise, it will only print out the "bad" windows,
- * those windows that overlap with their immediate neighbors.
- */
-#if 1
-static void __rsv_window_dump(struct rb_root *root, int verbose,
- const char *fn)
-{
- struct rb_node *n;
- struct ext4_reserve_window_node *rsv, *prev;
- int bad;
-
-restart:
- n = rb_first(root);
- bad = 0;
- prev = NULL;
-
- printk("Block Allocation Reservation Windows Map (%s):\n", fn);
- while (n) {
- rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
- if (verbose)
- printk("reservation window 0x%p "
- "start: %llu, end: %llu\n",
- rsv, rsv->rsv_start, rsv->rsv_end);
- if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
- printk("Bad reservation %p (start >= end)\n",
- rsv);
- bad = 1;
- }
- if (prev && prev->rsv_end >= rsv->rsv_start) {
- printk("Bad reservation %p (prev->end >= start)\n",
- rsv);
- bad = 1;
- }
- if (bad) {
- if (!verbose) {
- printk("Restarting reservation walk in verbose mode\n");
- verbose = 1;
- goto restart;
- }
- }
- n = rb_next(n);
- prev = rsv;
- }
- printk("Window map complete.\n");
- if (bad)
- BUG();
-}
-#define rsv_window_dump(root, verbose) \
- __rsv_window_dump((root), (verbose), __func__)
-#else
-#define rsv_window_dump(root, verbose) do {} while (0)
-#endif
-
-/**
- * goal_in_my_reservation()
- * @rsv: inode's reservation window
- * @grp_goal: given goal block relative to the allocation block group
- * @group: the current allocation block group
- * @sb: filesystem super block
- *
- * Test if the given goal block (group relative) is within the file's
- * own block reservation window range.
- *
- * If the reservation window is outside the goal allocation group, return 0;
- * grp_goal (given goal block) could be -1, which means no specific
- * goal block. In this case, always return 1.
- * If the goal block is within the reservation window, return 1;
- * otherwise, return 0;
- */
-static int
-goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal,
- ext4_group_t group, struct super_block *sb)
-{
- ext4_fsblk_t group_first_block, group_last_block;
-
- group_first_block = ext4_group_first_block_no(sb, group);
- group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
- if ((rsv->_rsv_start > group_last_block) ||
- (rsv->_rsv_end < group_first_block))
- return 0;
- if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
- || (grp_goal + group_first_block > rsv->_rsv_end)))
- return 0;
- return 1;
-}
-
-/**
- * search_reserve_window()
- * @rb_root: root of reservation tree
- * @goal: target allocation block
- *
- * Find the reserved window which includes the goal, or the previous one
- * if the goal is not in any window.
- * Returns NULL if there are no windows or if all windows start after the goal.
- */
-static struct ext4_reserve_window_node *
-search_reserve_window(struct rb_root *root, ext4_fsblk_t goal)
-{
- struct rb_node *n = root->rb_node;
- struct ext4_reserve_window_node *rsv;
-
- if (!n)
- return NULL;
-
- do {
- rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
-
- if (goal < rsv->rsv_start)
- n = n->rb_left;
- else if (goal > rsv->rsv_end)
- n = n->rb_right;
- else
- return rsv;
- } while (n);
- /*
- * We've fallen off the end of the tree: the goal wasn't inside
- * any particular node. OK, the previous node must be to one
- * side of the interval containing the goal. If it's the RHS,
- * we need to back up one.
- */
- if (rsv->rsv_start > goal) {
- n = rb_prev(&rsv->rsv_node);
- rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node);
- }
- return rsv;
-}
-
-/**
- * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree.
- * @sb: super block
- * @rsv: reservation window to add
- *
- * Must be called with rsv_lock hold.
- */
-void ext4_rsv_window_add(struct super_block *sb,
- struct ext4_reserve_window_node *rsv)
-{
- struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root;
- struct rb_node *node = &rsv->rsv_node;
- ext4_fsblk_t start = rsv->rsv_start;
-
- struct rb_node ** p = &root->rb_node;
- struct rb_node * parent = NULL;
- struct ext4_reserve_window_node *this;
-
- while (*p)
- {
- parent = *p;
- this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node);
-
- if (start < this->rsv_start)
- p = &(*p)->rb_left;
- else if (start > this->rsv_end)
- p = &(*p)->rb_right;
- else {
- rsv_window_dump(root, 1);
- BUG();
- }
- }
-
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
-}
-
-/**
- * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree
- * @sb: super block
- * @rsv: reservation window to remove
- *
- * Mark the block reservation window as not allocated, and unlink it
- * from the filesystem reservation window rb tree. Must be called with
- * rsv_lock hold.
- */
-static void rsv_window_remove(struct super_block *sb,
- struct ext4_reserve_window_node *rsv)
-{
- rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
- rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
- rsv->rsv_alloc_hit = 0;
- rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root);
-}
-
-/*
- * rsv_is_empty() -- Check if the reservation window is allocated.
- * @rsv: given reservation window to check
- *
- * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED.
- */
-static inline int rsv_is_empty(struct ext4_reserve_window *rsv)
-{
- /* a valid reservation end block could not be 0 */
- return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-}
-
-/**
- * ext4_init_block_alloc_info()
- * @inode: file inode structure
- *
- * Allocate and initialize the reservation window structure, and
- * link the window to the ext4 inode structure at last
- *
- * The reservation window structure is only dynamically allocated
- * and linked to ext4 inode the first time the open file
- * needs a new block. So, before every ext4_new_block(s) call, for
- * regular files, we should check whether the reservation window
- * structure exists or not. In the latter case, this function is called.
- * Fail to do so will result in block reservation being turned off for that
- * open file.
- *
- * This function is called from ext4_get_blocks_handle(), also called
- * when setting the reservation window size through ioctl before the file
- * is open for write (needs block allocation).
- *
- * Needs down_write(i_data_sem) protection prior to call this function.
- */
-void ext4_init_block_alloc_info(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
- struct super_block *sb = inode->i_sb;
-
- block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
- if (block_i) {
- struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node;
-
- rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
- rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
-
- /*
- * if filesystem is mounted with NORESERVATION, the goal
- * reservation window size is set to zero to indicate
- * block reservation is off
- */
- if (!test_opt(sb, RESERVATION))
- rsv->rsv_goal_size = 0;
- else
- rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS;
- rsv->rsv_alloc_hit = 0;
- block_i->last_alloc_logical_block = 0;
- block_i->last_alloc_physical_block = 0;
- }
- ei->i_block_alloc_info = block_i;
-}
-
-/**
- * ext4_discard_reservation()
- * @inode: inode
- *
- * Discard(free) block reservation window on last file close, or truncate
- * or at last iput().
- *
- * It is being called in three cases:
- * ext4_release_file(): last writer close the file
- * ext4_clear_inode(): last iput(), when nobody link to this file.
- * ext4_truncate(): when the block indirect map is about to change.
- *
- */
-void ext4_discard_reservation(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info;
- struct ext4_reserve_window_node *rsv;
- spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock;
-
- ext4_mb_discard_inode_preallocations(inode);
-
- if (!block_i)
- return;
-
- rsv = &block_i->rsv_window_node;
- if (!rsv_is_empty(&rsv->rsv_window)) {
- spin_lock(rsv_lock);
- if (!rsv_is_empty(&rsv->rsv_window))
- rsv_window_remove(inode->i_sb, rsv);
- spin_unlock(rsv_lock);
- }
-}
/**
* ext4_free_blocks_sb() -- Free given blocks and update quota
@@ -651,6 +356,13 @@ void ext4_discard_reservation(struct inode *inode)
* @block: start physcial block to free
* @count: number of blocks to free
* @pdquot_freed_blocks: pointer to quota
+ *
+ * XXX This function is only used by the on-line resizing code, which
+ * should probably be fixed up to call the mballoc variant. There
+ * this needs to be cleaned up later; in fact, I'm not convinced this
+ * is 100% correct in the face of the mballoc code. The online resizing
+ * code needs to be fixed up to more tightly (and correctly) interlock
+ * with the mballoc code.
*/
void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
ext4_fsblk_t block, unsigned long count,
@@ -662,8 +374,8 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
ext4_grpblk_t bit;
unsigned long i;
unsigned long overflow;
- struct ext4_group_desc * desc;
- struct ext4_super_block * es;
+ struct ext4_group_desc *desc;
+ struct ext4_super_block *es;
struct ext4_sb_info *sbi;
int err = 0, ret;
ext4_grpblk_t group_freed;
@@ -674,13 +386,13 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
if (block < le32_to_cpu(es->s_first_data_block) ||
block + count < block ||
block + count > ext4_blocks_count(es)) {
- ext4_error (sb, "ext4_free_blocks",
- "Freeing blocks not in datazone - "
- "block = %llu, count = %lu", block, count);
+ ext4_error(sb, "ext4_free_blocks",
+ "Freeing blocks not in datazone - "
+ "block = %llu, count = %lu", block, count);
goto error_return;
}
- ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1);
+ ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1);
do_more:
overflow = 0;
@@ -694,10 +406,10 @@ do_more:
count -= overflow;
}
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
- desc = ext4_get_group_desc (sb, block_group, &gd_bh);
+ desc = ext4_get_group_desc(sb, block_group, &gd_bh);
if (!desc)
goto error_return;
@@ -706,10 +418,10 @@ do_more:
in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
in_range(block + count - 1, ext4_inode_table(sb, desc),
sbi->s_itb_per_group)) {
- ext4_error (sb, "ext4_free_blocks",
- "Freeing blocks in system zones - "
- "Block = %llu, count = %lu",
- block, count);
+ ext4_error(sb, "ext4_free_blocks",
+ "Freeing blocks in system zones - "
+ "Block = %llu, count = %lu",
+ block, count);
goto error_return;
}
@@ -810,6 +522,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -844,7 +563,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count,
int metadata)
{
- struct super_block * sb;
+ struct super_block *sb;
unsigned long dquot_freed_blocks;
/* this isn't the right place to decide whether block is metadata
@@ -855,766 +574,90 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
sb = inode->i_sb;
- if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info)
- ext4_free_blocks_sb(handle, sb, block, count,
- &dquot_freed_blocks);
- else
- ext4_mb_free_blocks(handle, inode, block, count,
- metadata, &dquot_freed_blocks);
+ ext4_mb_free_blocks(handle, inode, block, count,
+ metadata, &dquot_freed_blocks);
if (dquot_freed_blocks)
DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
return;
}
-/**
- * ext4_test_allocatable()
- * @nr: given allocation block group
- * @bh: bufferhead contains the bitmap of the given block group
- *
- * For ext4 allocations, we must not reuse any blocks which are
- * allocated in the bitmap buffer's "last committed data" copy. This
- * prevents deletes from freeing up the page for reuse until we have
- * committed the delete transaction.
- *
- * If we didn't do this, then deleting something and reallocating it as
- * data would allow the old block to be overwritten before the
- * transaction committed (because we force data to disk before commit).
- * This would lead to corruption if we crashed between overwriting the
- * data and committing the delete.
- *
- * @@@ We may want to make this allocation behaviour conditional on
- * data-writes at some point, and disable it for metadata allocations or
- * sync-data inodes.
- */
-static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh)
-{
- int ret;
- struct journal_head *jh = bh2jh(bh);
-
- if (ext4_test_bit(nr, bh->b_data))
- return 0;
-
- jbd_lock_bh_state(bh);
- if (!jh->b_committed_data)
- ret = 1;
- else
- ret = !ext4_test_bit(nr, jh->b_committed_data);
- jbd_unlock_bh_state(bh);
- return ret;
-}
-
-/**
- * bitmap_search_next_usable_block()
- * @start: the starting block (group relative) of the search
- * @bh: bufferhead contains the block group bitmap
- * @maxblocks: the ending block (group relative) of the reservation
- *
- * The bitmap search --- search forward alternately through the actual
- * bitmap on disk and the last-committed copy in journal, until we find a
- * bit free in both bitmaps.
- */
-static ext4_grpblk_t
-bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
- ext4_grpblk_t maxblocks)
-{
- ext4_grpblk_t next;
- struct journal_head *jh = bh2jh(bh);
-
- while (start < maxblocks) {
- next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start);
- if (next >= maxblocks)
- return -1;
- if (ext4_test_allocatable(next, bh))
- return next;
- jbd_lock_bh_state(bh);
- if (jh->b_committed_data)
- start = ext4_find_next_zero_bit(jh->b_committed_data,
- maxblocks, next);
- jbd_unlock_bh_state(bh);
- }
- return -1;
-}
-
-/**
- * find_next_usable_block()
- * @start: the starting block (group relative) to find next
- * allocatable block in bitmap.
- * @bh: bufferhead contains the block group bitmap
- * @maxblocks: the ending block (group relative) for the search
- *
- * Find an allocatable block in a bitmap. We honor both the bitmap and
- * its last-committed copy (if that exists), and perform the "most
- * appropriate allocation" algorithm of looking for a free block near
- * the initial goal; then for a free byte somewhere in the bitmap; then
- * for any free bit in the bitmap.
- */
-static ext4_grpblk_t
-find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh,
- ext4_grpblk_t maxblocks)
-{
- ext4_grpblk_t here, next;
- char *p, *r;
-
- if (start > 0) {
- /*
- * The goal was occupied; search forward for a free
- * block within the next XX blocks.
- *
- * end_goal is more or less random, but it has to be
- * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the
- * next 64-bit boundary is simple..
- */
- ext4_grpblk_t end_goal = (start + 63) & ~63;
- if (end_goal > maxblocks)
- end_goal = maxblocks;
- here = ext4_find_next_zero_bit(bh->b_data, end_goal, start);
- if (here < end_goal && ext4_test_allocatable(here, bh))
- return here;
- ext4_debug("Bit not found near goal\n");
- }
-
- here = start;
- if (here < 0)
- here = 0;
-
- p = ((char *)bh->b_data) + (here >> 3);
- r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
- next = (r - ((char *)bh->b_data)) << 3;
-
- if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh))
- return next;
-
- /*
- * The bitmap search --- search forward alternately through the actual
- * bitmap and the last-committed copy until we find a bit free in
- * both
- */
- here = bitmap_search_next_usable_block(here, bh, maxblocks);
- return here;
-}
-
-/**
- * claim_block()
- * @block: the free block (group relative) to allocate
- * @bh: the bufferhead containts the block group bitmap
- *
- * We think we can allocate this block in this bitmap. Try to set the bit.
- * If that succeeds then check that nobody has allocated and then freed the
- * block since we saw that is was not marked in b_committed_data. If it _was_
- * allocated and freed then clear the bit in the bitmap again and return
- * zero (failure).
- */
-static inline int
-claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh)
-{
- struct journal_head *jh = bh2jh(bh);
- int ret;
-
- if (ext4_set_bit_atomic(lock, block, bh->b_data))
- return 0;
- jbd_lock_bh_state(bh);
- if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) {
- ext4_clear_bit_atomic(lock, block, bh->b_data);
- ret = 0;
- } else {
- ret = 1;
- }
- jbd_unlock_bh_state(bh);
- return ret;
-}
-
-/**
- * ext4_try_to_allocate()
- * @sb: superblock
- * @handle: handle to this transaction
- * @group: given allocation block group
- * @bitmap_bh: bufferhead holds the block bitmap
- * @grp_goal: given target block within the group
- * @count: target number of blocks to allocate
- * @my_rsv: reservation window
- *
- * Attempt to allocate blocks within a give range. Set the range of allocation
- * first, then find the first free bit(s) from the bitmap (within the range),
- * and at last, allocate the blocks by claiming the found free bit as allocated.
- *
- * To set the range of this allocation:
- * if there is a reservation window, only try to allocate block(s) from the
- * file's own reservation window;
- * Otherwise, the allocation range starts from the give goal block, ends at
- * the block group's last block.
- *
- * If we failed to allocate the desired block then we may end up crossing to a
- * new bitmap. In that case we must release write access to the old one via
- * ext4_journal_release_buffer(), else we'll run out of credits.
- */
-static ext4_grpblk_t
-ext4_try_to_allocate(struct super_block *sb, handle_t *handle,
- ext4_group_t group, struct buffer_head *bitmap_bh,
- ext4_grpblk_t grp_goal, unsigned long *count,
- struct ext4_reserve_window *my_rsv)
+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+ s64 nblocks)
{
- ext4_fsblk_t group_first_block;
- ext4_grpblk_t start, end;
- unsigned long num = 0;
-
- /* we do allocation within the reservation window if we have a window */
- if (my_rsv) {
- group_first_block = ext4_group_first_block_no(sb, group);
- if (my_rsv->_rsv_start >= group_first_block)
- start = my_rsv->_rsv_start - group_first_block;
- else
- /* reservation window cross group boundary */
- start = 0;
- end = my_rsv->_rsv_end - group_first_block + 1;
- if (end > EXT4_BLOCKS_PER_GROUP(sb))
- /* reservation window crosses group boundary */
- end = EXT4_BLOCKS_PER_GROUP(sb);
- if ((start <= grp_goal) && (grp_goal < end))
- start = grp_goal;
- else
- grp_goal = -1;
- } else {
- if (grp_goal > 0)
- start = grp_goal;
- else
- start = 0;
- end = EXT4_BLOCKS_PER_GROUP(sb);
- }
+ s64 free_blocks, dirty_blocks;
+ s64 root_blocks = 0;
+ struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+ struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
- BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb));
-
-repeat:
- if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) {
- grp_goal = find_next_usable_block(start, bitmap_bh, end);
- if (grp_goal < 0)
- goto fail_access;
- if (!my_rsv) {
- int i;
-
- for (i = 0; i < 7 && grp_goal > start &&
- ext4_test_allocatable(grp_goal - 1,
- bitmap_bh);
- i++, grp_goal--)
- ;
- }
- }
- start = grp_goal;
-
- if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group),
- grp_goal, bitmap_bh)) {
- /*
- * The block was allocated by another thread, or it was
- * allocated and then freed by another thread
- */
- start++;
- grp_goal++;
- if (start >= end)
- goto fail_access;
- goto repeat;
- }
- num++;
- grp_goal++;
- while (num < *count && grp_goal < end
- && ext4_test_allocatable(grp_goal, bitmap_bh)
- && claim_block(sb_bgl_lock(EXT4_SB(sb), group),
- grp_goal, bitmap_bh)) {
- num++;
- grp_goal++;
- }
- *count = num;
- return grp_goal - num;
-fail_access:
- *count = num;
- return -1;
-}
+ free_blocks = percpu_counter_read_positive(fbc);
+ dirty_blocks = percpu_counter_read_positive(dbc);
-/**
- * find_next_reservable_window():
- * find a reservable space within the given range.
- * It does not allocate the reservation window for now:
- * alloc_new_reservation() will do the work later.
- *
- * @search_head: the head of the searching list;
- * This is not necessarily the list head of the whole filesystem
- *
- * We have both head and start_block to assist the search
- * for the reservable space. The list starts from head,
- * but we will shift to the place where start_block is,
- * then start from there, when looking for a reservable space.
- *
- * @size: the target new reservation window size
- *
- * @group_first_block: the first block we consider to start
- * the real search from
- *
- * @last_block:
- * the maximum block number that our goal reservable space
- * could start from. This is normally the last block in this
- * group. The search will end when we found the start of next
- * possible reservable space is out of this boundary.
- * This could handle the cross boundary reservation window
- * request.
- *
- * basically we search from the given range, rather than the whole
- * reservation double linked list, (start_block, last_block)
- * to find a free region that is of my size and has not
- * been reserved.
- *
- */
-static int find_next_reservable_window(
- struct ext4_reserve_window_node *search_head,
- struct ext4_reserve_window_node *my_rsv,
- struct super_block * sb,
- ext4_fsblk_t start_block,
- ext4_fsblk_t last_block)
-{
- struct rb_node *next;
- struct ext4_reserve_window_node *rsv, *prev;
- ext4_fsblk_t cur;
- int size = my_rsv->rsv_goal_size;
-
- /* TODO: make the start of the reservation window byte-aligned */
- /* cur = *start_block & ~7;*/
- cur = start_block;
- rsv = search_head;
- if (!rsv)
- return -1;
-
- while (1) {
- if (cur <= rsv->rsv_end)
- cur = rsv->rsv_end + 1;
-
- /* TODO?
- * in the case we could not find a reservable space
- * that is what is expected, during the re-search, we could
- * remember what's the largest reservable space we could have
- * and return that one.
- *
- * For now it will fail if we could not find the reservable
- * space with expected-size (or more)...
- */
- if (cur > last_block)
- return -1; /* fail */
-
- prev = rsv;
- next = rb_next(&rsv->rsv_node);
- rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node);
-
- /*
- * Reached the last reservation, we can just append to the
- * previous one.
- */
- if (!next)
- break;
-
- if (cur + size <= rsv->rsv_start) {
- /*
- * Found a reserveable space big enough. We could
- * have a reservation across the group boundary here
- */
- break;
+ if (!capable(CAP_SYS_RESOURCE) &&
+ sbi->s_resuid != current->fsuid &&
+ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
+
+ if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+ EXT4_FREEBLOCKS_WATERMARK) {
+ free_blocks = percpu_counter_sum(fbc);
+ dirty_blocks = percpu_counter_sum(dbc);
+ if (dirty_blocks < 0) {
+ printk(KERN_CRIT "Dirty block accounting "
+ "went wrong %lld\n",
+ dirty_blocks);
}
}
- /*
- * we come here either :
- * when we reach the end of the whole list,
- * and there is empty reservable space after last entry in the list.
- * append it to the end of the list.
- *
- * or we found one reservable space in the middle of the list,
- * return the reservation window that we could append to.
- * succeed.
+ /* Check whether we have space after
+ * accounting for current dirty blocks
*/
+ if (free_blocks < ((root_blocks + nblocks) + dirty_blocks))
+ /* we don't have free space */
+ return -ENOSPC;
- if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
- rsv_window_remove(sb, my_rsv);
-
- /*
- * Let's book the whole avaliable window for now. We will check the
- * disk bitmap later and then, if there are free blocks then we adjust
- * the window size if it's larger than requested.
- * Otherwise, we will remove this node from the tree next time
- * call find_next_reservable_window.
- */
- my_rsv->rsv_start = cur;
- my_rsv->rsv_end = cur + size - 1;
- my_rsv->rsv_alloc_hit = 0;
-
- if (prev != my_rsv)
- ext4_rsv_window_add(sb, my_rsv);
-
+ /* Add the blocks to nblocks */
+ percpu_counter_add(dbc, nblocks);
return 0;
}
/**
- * alloc_new_reservation()--allocate a new reservation window
- *
- * To make a new reservation, we search part of the filesystem
- * reservation list (the list that inside the group). We try to
- * allocate a new reservation window near the allocation goal,
- * or the beginning of the group, if there is no goal.
- *
- * We first find a reservable space after the goal, then from
- * there, we check the bitmap for the first free block after
- * it. If there is no free block until the end of group, then the
- * whole group is full, we failed. Otherwise, check if the free
- * block is inside the expected reservable space, if so, we
- * succeed.
- * If the first free block is outside the reservable space, then
- * start from the first free block, we search for next available
- * space, and go on.
- *
- * on succeed, a new reservation will be found and inserted into the list
- * It contains at least one free block, and it does not overlap with other
- * reservation windows.
- *
- * failed: we failed to find a reservation window in this group
- *
- * @rsv: the reservation
- *
- * @grp_goal: The goal (group-relative). It is where the search for a
- * free reservable space should start from.
- * if we have a grp_goal(grp_goal >0 ), then start from there,
- * no grp_goal(grp_goal = -1), we start from the first block
- * of the group.
- *
- * @sb: the super block
- * @group: the group we are trying to allocate in
- * @bitmap_bh: the block group block bitmap
- *
- */
-static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv,
- ext4_grpblk_t grp_goal, struct super_block *sb,
- ext4_group_t group, struct buffer_head *bitmap_bh)
-{
- struct ext4_reserve_window_node *search_head;
- ext4_fsblk_t group_first_block, group_end_block, start_block;
- ext4_grpblk_t first_free_block;
- struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root;
- unsigned long size;
- int ret;
- spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
-
- group_first_block = ext4_group_first_block_no(sb, group);
- group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
- if (grp_goal < 0)
- start_block = group_first_block;
- else
- start_block = grp_goal + group_first_block;
-
- size = my_rsv->rsv_goal_size;
-
- if (!rsv_is_empty(&my_rsv->rsv_window)) {
- /*
- * if the old reservation is cross group boundary
- * and if the goal is inside the old reservation window,
- * we will come here when we just failed to allocate from
- * the first part of the window. We still have another part
- * that belongs to the next group. In this case, there is no
- * point to discard our window and try to allocate a new one
- * in this group(which will fail). we should
- * keep the reservation window, just simply move on.
- *
- * Maybe we could shift the start block of the reservation
- * window to the first block of next group.
- */
-
- if ((my_rsv->rsv_start <= group_end_block) &&
- (my_rsv->rsv_end > group_end_block) &&
- (start_block >= my_rsv->rsv_start))
- return -1;
-
- if ((my_rsv->rsv_alloc_hit >
- (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
- /*
- * if the previously allocation hit ratio is
- * greater than 1/2, then we double the size of
- * the reservation window the next time,
- * otherwise we keep the same size window
- */
- size = size * 2;
- if (size > EXT4_MAX_RESERVE_BLOCKS)
- size = EXT4_MAX_RESERVE_BLOCKS;
- my_rsv->rsv_goal_size= size;
- }
- }
-
- spin_lock(rsv_lock);
- /*
- * shift the search start to the window near the goal block
- */
- search_head = search_reserve_window(fs_rsv_root, start_block);
-
- /*
- * find_next_reservable_window() simply finds a reservable window
- * inside the given range(start_block, group_end_block).
- *
- * To make sure the reservation window has a free bit inside it, we
- * need to check the bitmap after we found a reservable window.
- */
-retry:
- ret = find_next_reservable_window(search_head, my_rsv, sb,
- start_block, group_end_block);
-
- if (ret == -1) {
- if (!rsv_is_empty(&my_rsv->rsv_window))
- rsv_window_remove(sb, my_rsv);
- spin_unlock(rsv_lock);
- return -1;
- }
-
- /*
- * On success, find_next_reservable_window() returns the
- * reservation window where there is a reservable space after it.
- * Before we reserve this reservable space, we need
- * to make sure there is at least a free block inside this region.
- *
- * searching the first free bit on the block bitmap and copy of
- * last committed bitmap alternatively, until we found a allocatable
- * block. Search start from the start block of the reservable space
- * we just found.
- */
- spin_unlock(rsv_lock);
- first_free_block = bitmap_search_next_usable_block(
- my_rsv->rsv_start - group_first_block,
- bitmap_bh, group_end_block - group_first_block + 1);
-
- if (first_free_block < 0) {
- /*
- * no free block left on the bitmap, no point
- * to reserve the space. return failed.
- */
- spin_lock(rsv_lock);
- if (!rsv_is_empty(&my_rsv->rsv_window))
- rsv_window_remove(sb, my_rsv);
- spin_unlock(rsv_lock);
- return -1; /* failed */
- }
-
- start_block = first_free_block + group_first_block;
- /*
- * check if the first free block is within the
- * free space we just reserved
- */
- if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
- return 0; /* success */
- /*
- * if the first free bit we found is out of the reservable space
- * continue search for next reservable space,
- * start from where the free block is,
- * we also shift the list head to where we stopped last time
- */
- search_head = my_rsv;
- spin_lock(rsv_lock);
- goto retry;
-}
-
-/**
- * try_to_extend_reservation()
- * @my_rsv: given reservation window
- * @sb: super block
- * @size: the delta to extend
- *
- * Attempt to expand the reservation window large enough to have
- * required number of free blocks
- *
- * Since ext4_try_to_allocate() will always allocate blocks within
- * the reservation window range, if the window size is too small,
- * multiple blocks allocation has to stop at the end of the reservation
- * window. To make this more efficient, given the total number of
- * blocks needed and the current size of the window, we try to
- * expand the reservation window size if necessary on a best-effort
- * basis before ext4_new_blocks() tries to allocate blocks,
- */
-static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv,
- struct super_block *sb, int size)
-{
- struct ext4_reserve_window_node *next_rsv;
- struct rb_node *next;
- spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock;
-
- if (!spin_trylock(rsv_lock))
- return;
-
- next = rb_next(&my_rsv->rsv_node);
-
- if (!next)
- my_rsv->rsv_end += size;
- else {
- next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node);
-
- if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
- my_rsv->rsv_end += size;
- else
- my_rsv->rsv_end = next_rsv->rsv_start - 1;
- }
- spin_unlock(rsv_lock);
-}
-
-/**
- * ext4_try_to_allocate_with_rsv()
- * @sb: superblock
- * @handle: handle to this transaction
- * @group: given allocation block group
- * @bitmap_bh: bufferhead holds the block bitmap
- * @grp_goal: given target block within the group
- * @count: target number of blocks to allocate
- * @my_rsv: reservation window
- * @errp: pointer to store the error code
- *
- * This is the main function used to allocate a new block and its reservation
- * window.
- *
- * Each time when a new block allocation is need, first try to allocate from
- * its own reservation. If it does not have a reservation window, instead of
- * looking for a free bit on bitmap first, then look up the reservation list to
- * see if it is inside somebody else's reservation window, we try to allocate a
- * reservation window for it starting from the goal first. Then do the block
- * allocation within the reservation window.
- *
- * This will avoid keeping on searching the reservation list again and
- * again when somebody is looking for a free block (without
- * reservation), and there are lots of free blocks, but they are all
- * being reserved.
- *
- * We use a red-black tree for the per-filesystem reservation list.
+ * ext4_has_free_blocks()
+ * @sbi: in-core super block structure.
+ * @nblocks: number of neeed blocks
*
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks avaible for allocation for this request
+ * On success, return nblocks
*/
-static ext4_grpblk_t
-ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
- ext4_group_t group, struct buffer_head *bitmap_bh,
- ext4_grpblk_t grp_goal,
- struct ext4_reserve_window_node * my_rsv,
- unsigned long *count, int *errp)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ s64 nblocks)
{
- ext4_fsblk_t group_first_block, group_last_block;
- ext4_grpblk_t ret = 0;
- int fatal;
- unsigned long num = *count;
-
- *errp = 0;
+ s64 free_blocks, dirty_blocks;
+ s64 root_blocks = 0;
+ struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+ struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
- /*
- * Make sure we use undo access for the bitmap, because it is critical
- * that we do the frozen_data COW on bitmap buffers in all cases even
- * if the buffer is in BJ_Forget state in the committing transaction.
- */
- BUFFER_TRACE(bitmap_bh, "get undo access for new block");
- fatal = ext4_journal_get_undo_access(handle, bitmap_bh);
- if (fatal) {
- *errp = fatal;
- return -1;
- }
+ free_blocks = percpu_counter_read_positive(fbc);
+ dirty_blocks = percpu_counter_read_positive(dbc);
- /*
- * we don't deal with reservation when
- * filesystem is mounted without reservation
- * or the file is not a regular file
- * or last attempt to allocate a block with reservation turned on failed
- */
- if (my_rsv == NULL ) {
- ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
- grp_goal, count, NULL);
- goto out;
- }
- /*
- * grp_goal is a group relative block number (if there is a goal)
- * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb)
- * first block is a filesystem wide block number
- * first block is the block number of the first block in this group
- */
- group_first_block = ext4_group_first_block_no(sb, group);
- group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1);
-
- /*
- * Basically we will allocate a new block from inode's reservation
- * window.
- *
- * We need to allocate a new reservation window, if:
- * a) inode does not have a reservation window; or
- * b) last attempt to allocate a block from existing reservation
- * failed; or
- * c) we come here with a goal and with a reservation window
- *
- * We do not need to allocate a new reservation window if we come here
- * at the beginning with a goal and the goal is inside the window, or
- * we don't have a goal but already have a reservation window.
- * then we could go to allocate from the reservation window directly.
- */
- while (1) {
- if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
- !goal_in_my_reservation(&my_rsv->rsv_window,
- grp_goal, group, sb)) {
- if (my_rsv->rsv_goal_size < *count)
- my_rsv->rsv_goal_size = *count;
- ret = alloc_new_reservation(my_rsv, grp_goal, sb,
- group, bitmap_bh);
- if (ret < 0)
- break; /* failed */
-
- if (!goal_in_my_reservation(&my_rsv->rsv_window,
- grp_goal, group, sb))
- grp_goal = -1;
- } else if (grp_goal >= 0) {
- int curr = my_rsv->rsv_end -
- (grp_goal + group_first_block) + 1;
-
- if (curr < *count)
- try_to_extend_reservation(my_rsv, sb,
- *count - curr);
- }
+ if (!capable(CAP_SYS_RESOURCE) &&
+ sbi->s_resuid != current->fsuid &&
+ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
- if ((my_rsv->rsv_start > group_last_block) ||
- (my_rsv->rsv_end < group_first_block)) {
- rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1);
- BUG();
- }
- ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh,
- grp_goal, &num, &my_rsv->rsv_window);
- if (ret >= 0) {
- my_rsv->rsv_alloc_hit += num;
- *count = num;
- break; /* succeed */
- }
- num = *count;
- }
-out:
- if (ret >= 0) {
- BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
- "bitmap block");
- fatal = ext4_journal_dirty_metadata(handle, bitmap_bh);
- if (fatal) {
- *errp = fatal;
- return -1;
- }
- return ret;
+ if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+ EXT4_FREEBLOCKS_WATERMARK) {
+ free_blocks = percpu_counter_sum(fbc);
+ dirty_blocks = percpu_counter_sum(dbc);
}
+ if (free_blocks <= (root_blocks + dirty_blocks))
+ /* we don't have free space */
+ return 0;
- BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
- ext4_journal_release_buffer(handle, bitmap_bh);
- return ret;
+ if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
+ return free_blocks - (root_blocks + dirty_blocks);
+ return nblocks;
}
-/**
- * ext4_has_free_blocks()
- * @sbi: in-core super block structure.
- *
- * Check if filesystem has at least 1 free block available for allocation.
- */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
-{
- ext4_fsblk_t free_blocks, root_blocks;
-
- free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- root_blocks = ext4_r_blocks_count(sbi->s_es);
- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
- sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
- return 0;
- }
- return 1;
-}
/**
* ext4_should_retry_alloc()
@@ -1630,7 +673,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+ if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
return 0;
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1638,323 +681,100 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
}
-/**
- * ext4_new_blocks_old() -- core block(s) allocation function
- * @handle: handle to this transaction
- * @inode: file inode
- * @goal: given target block(filesystem wide)
- * @count: target number of blocks to allocate
- * @errp: error code
- *
- * ext4_new_blocks uses a goal block to assist allocation. It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
- *
- */
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, unsigned long *count, int *errp)
-{
- struct buffer_head *bitmap_bh = NULL;
- struct buffer_head *gdp_bh;
- ext4_group_t group_no;
- ext4_group_t goal_group;
- ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */
- ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/
- ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */
- ext4_group_t bgi; /* blockgroup iteration index */
- int fatal = 0, err;
- int performed_allocation = 0;
- ext4_grpblk_t free_blocks; /* number of free blocks in a group */
- struct super_block *sb;
- struct ext4_group_desc *gdp;
- struct ext4_super_block *es;
- struct ext4_sb_info *sbi;
- struct ext4_reserve_window_node *my_rsv = NULL;
- struct ext4_block_alloc_info *block_i;
- unsigned short windowsz = 0;
- ext4_group_t ngroups;
- unsigned long num = *count;
-
- *errp = -ENOSPC;
- sb = inode->i_sb;
- if (!sb) {
- printk("ext4_new_block: nonexistent device");
- return 0;
- }
-
- /*
- * Check quota for allocation of this block.
- */
- if (DQUOT_ALLOC_BLOCK(inode, num)) {
- *errp = -EDQUOT;
- return 0;
- }
+#define EXT4_META_BLOCK 0x1
- sbi = EXT4_SB(sb);
- es = EXT4_SB(sb)->s_es;
- ext4_debug("goal=%llu.\n", goal);
- /*
- * Allocate a block from reservation only when
- * filesystem is mounted with reservation(default,-o reservation), and
- * it's a regular file, and
- * the desired window size is greater than 0 (One could use ioctl
- * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off
- * reservation on that particular file)
- */
- block_i = EXT4_I(inode)->i_block_alloc_info;
- if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
- my_rsv = &block_i->rsv_window_node;
-
- if (!ext4_has_free_blocks(sbi)) {
- *errp = -ENOSPC;
- goto out;
- }
-
- /*
- * First, test whether the goal block is free.
- */
- if (goal < le32_to_cpu(es->s_first_data_block) ||
- goal >= ext4_blocks_count(es))
- goal = le32_to_cpu(es->s_first_data_block);
- ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk);
- goal_group = group_no;
-retry_alloc:
- gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
- if (!gdp)
- goto io_error;
-
- free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
- /*
- * if there is not enough free blocks to make a new resevation
- * turn off reservation for this allocation
- */
- if (my_rsv && (free_blocks < windowsz)
- && (rsv_is_empty(&my_rsv->rsv_window)))
- my_rsv = NULL;
-
- if (free_blocks > 0) {
- bitmap_bh = read_block_bitmap(sb, group_no);
- if (!bitmap_bh)
- goto io_error;
- grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
- group_no, bitmap_bh, grp_target_blk,
- my_rsv, &num, &fatal);
- if (fatal)
- goto out;
- if (grp_alloc_blk >= 0)
- goto allocated;
- }
-
- ngroups = EXT4_SB(sb)->s_groups_count;
- smp_rmb();
-
- /*
- * Now search the rest of the groups. We assume that
- * group_no and gdp correctly point to the last group visited.
- */
- for (bgi = 0; bgi < ngroups; bgi++) {
- group_no++;
- if (group_no >= ngroups)
- group_no = 0;
- gdp = ext4_get_group_desc(sb, group_no, &gdp_bh);
- if (!gdp)
- goto io_error;
- free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
- /*
- * skip this group if the number of
- * free blocks is less than half of the reservation
- * window size.
- */
- if (free_blocks <= (windowsz/2))
- continue;
-
- brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, group_no);
- if (!bitmap_bh)
- goto io_error;
- /*
- * try to allocate block(s) from this group, without a goal(-1).
- */
- grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
- group_no, bitmap_bh, -1, my_rsv,
- &num, &fatal);
- if (fatal)
- goto out;
- if (grp_alloc_blk >= 0)
- goto allocated;
- }
- /*
- * We may end up a bogus ealier ENOSPC error due to
- * filesystem is "full" of reservations, but
- * there maybe indeed free blocks avaliable on disk
- * In this case, we just forget about the reservations
- * just do block allocation as without reservations.
- */
- if (my_rsv) {
- my_rsv = NULL;
- windowsz = 0;
- group_no = goal_group;
- goto retry_alloc;
- }
- /* No space left on the device */
- *errp = -ENOSPC;
- goto out;
-
-allocated:
-
- ext4_debug("using block group %lu(%d)\n",
- group_no, gdp->bg_free_blocks_count);
-
- BUFFER_TRACE(gdp_bh, "get_write_access");
- fatal = ext4_journal_get_write_access(handle, gdp_bh);
- if (fatal)
- goto out;
-
- ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no);
-
- if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) ||
- in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) ||
- in_range(ret_block, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group) ||
- in_range(ret_block + num - 1, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group)) {
- ext4_error(sb, "ext4_new_block",
- "Allocating block in system zone - "
- "blocks from %llu, length %lu",
- ret_block, num);
- /*
- * claim_block marked the blocks we allocated
- * as in use. So we may want to selectively
- * mark some of the blocks as free
- */
- goto retry_alloc;
- }
-
- performed_allocation = 1;
-
-#ifdef CONFIG_JBD2_DEBUG
- {
- struct buffer_head *debug_bh;
-
- /* Record bitmap buffer state in the newly allocated block */
- debug_bh = sb_find_get_block(sb, ret_block);
- if (debug_bh) {
- BUFFER_TRACE(debug_bh, "state when allocated");
- BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
- brelse(debug_bh);
- }
- }
- jbd_lock_bh_state(bitmap_bh);
- spin_lock(sb_bgl_lock(sbi, group_no));
- if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
- int i;
-
- for (i = 0; i < num; i++) {
- if (ext4_test_bit(grp_alloc_blk+i,
- bh2jh(bitmap_bh)->b_committed_data)) {
- printk("%s: block was unexpectedly set in "
- "b_committed_data\n", __func__);
- }
- }
- }
- ext4_debug("found bit %d\n", grp_alloc_blk);
- spin_unlock(sb_bgl_lock(sbi, group_no));
- jbd_unlock_bh_state(bitmap_bh);
-#endif
-
- if (ret_block + num - 1 >= ext4_blocks_count(es)) {
- ext4_error(sb, "ext4_new_block",
- "block(%llu) >= blocks count(%llu) - "
- "block_group = %lu, es == %p ", ret_block,
- ext4_blocks_count(es), group_no, es);
- goto out;
- }
-
- /*
- * It is up to the caller to add the new buffer to a journal
- * list of some description. We don't know in advance whether
- * the caller wants to use it as metadata or data.
- */
- spin_lock(sb_bgl_lock(sbi, group_no));
- if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- le16_add_cpu(&gdp->bg_free_blocks_count, -num);
- gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
- spin_unlock(sb_bgl_lock(sbi, group_no));
- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
-
- BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
- err = ext4_journal_dirty_metadata(handle, gdp_bh);
- if (!fatal)
- fatal = err;
-
- sb->s_dirt = 1;
- if (fatal)
- goto out;
-
- *errp = 0;
- brelse(bitmap_bh);
- DQUOT_FREE_BLOCK(inode, *count-num);
- *count = num;
- return ret_block;
-
-io_error:
- *errp = -EIO;
-out:
- if (fatal) {
- *errp = fatal;
- ext4_std_error(sb, fatal);
- }
- /*
- * Undo the block allocation
- */
- if (!performed_allocation)
- DQUOT_FREE_BLOCK(inode, *count);
- brelse(bitmap_bh);
- return 0;
-}
-
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int *errp)
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp, int flags)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
- if (!test_opt(inode->i_sb, MBALLOC)) {
- unsigned long count = 1;
- ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
- return ret;
- }
-
memset(&ar, 0, sizeof(ar));
+ /* Fill with neighbour allocated blocks */
+
ar.inode = inode;
ar.goal = goal;
- ar.len = 1;
+ ar.len = *count;
+ ar.logical = iblock;
+
+ if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+ /* enable in-core preallocation for data block allocation */
+ ar.flags = EXT4_MB_HINT_DATA;
+ else
+ /* disable in-core preallocation for non-regular files */
+ ar.flags = 0;
+
ret = ext4_mb_new_blocks(handle, &ar, errp);
+ *count = ar.len;
return ret;
}
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks need
+ * @errp: error code
+ *
+ * Return 1st allocated block numberon success, *count stores total account
+ * error stores in errp pointer
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
- struct ext4_allocation_request ar;
ext4_fsblk_t ret;
-
- if (!test_opt(inode->i_sb, MBALLOC)) {
- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
- return ret;
+ ret = do_blk_alloc(handle, inode, 0, goal,
+ count, errp, EXT4_META_BLOCK);
+ /*
+ * Account for the allocated meta blocks
+ */
+ if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ EXT4_I(inode)->i_allocated_meta_blocks += *count;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}
-
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = *count;
- ret = ext4_mb_new_blocks(handle, &ar, errp);
- *count = ar.len;
return ret;
}
+/*
+ * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @errp: error code
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, int *errp)
+{
+ unsigned long count = 1;
+ return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
+}
+
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @count: total number of blocks need
+ * @errp: error code
+ *
+ * Return 1st allocated block numberon success, *count stores total account
+ * error stores in errp pointer
+ */
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp)
+{
+ return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}
/**
* ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +806,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
continue;
desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, i);
+ bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
continue;
@@ -1996,10 +816,9 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
bitmap_count += x;
}
brelse(bitmap_bh);
- printk("ext4_count_free_blocks: stored = %llu"
- ", computed = %llu, %llu\n",
- ext4_free_blocks_count(es),
- desc_count, bitmap_count);
+ printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu"
+ ", computed = %llu, %llu\n", ext4_free_blocks_count(es),
+ desc_count, bitmap_count);
return bitmap_count;
#else
desc_count = 0;
@@ -2086,8 +905,9 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
metagroup < first_meta_bg)
- return ext4_bg_num_gdb_nometa(sb,group);
+ return ext4_bg_num_gdb_nometa(sb, group);
return ext4_bg_num_gdb_meta(sb,group);
}
+
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index d37ea675045..0a7a6663c19 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -15,17 +15,17 @@
static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
-unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars)
+unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars)
{
unsigned int i;
unsigned long sum = 0;
if (!map)
- return (0);
+ return 0;
for (i = 0; i < numchars; i++)
sum += nibblemap[map->b_data[i] & 0xf] +
nibblemap[(map->b_data[i] >> 4) & 0xf];
- return (sum);
+ return sum;
}
#endif /* EXT4FS_DEBUG */
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea19..3ca6a2b7632 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -33,10 +33,10 @@ static unsigned char ext4_filetype_table[] = {
};
static int ext4_readdir(struct file *, void *, filldir_t);
-static int ext4_dx_readdir(struct file * filp,
- void * dirent, filldir_t filldir);
-static int ext4_release_dir (struct inode * inode,
- struct file * filp);
+static int ext4_dx_readdir(struct file *filp,
+ void *dirent, filldir_t filldir);
+static int ext4_release_dir(struct inode *inode,
+ struct file *filp);
const struct file_operations ext4_dir_operations = {
.llseek = generic_file_llseek,
@@ -61,12 +61,12 @@ static unsigned char get_dtype(struct super_block *sb, int filetype)
}
-int ext4_check_dir_entry (const char * function, struct inode * dir,
- struct ext4_dir_entry_2 * de,
- struct buffer_head * bh,
- unsigned long offset)
+int ext4_check_dir_entry(const char *function, struct inode *dir,
+ struct ext4_dir_entry_2 *de,
+ struct buffer_head *bh,
+ unsigned long offset)
{
- const char * error_msg = NULL;
+ const char *error_msg = NULL;
const int rlen = ext4_rec_len_from_disk(de->rec_len);
if (rlen < EXT4_DIR_REC_LEN(1))
@@ -82,7 +82,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
error_msg = "inode out of bounds";
if (error_msg != NULL)
- ext4_error (dir->i_sb, function,
+ ext4_error(dir->i_sb, function,
"bad entry in directory #%lu: %s - "
"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
dir->i_ino, error_msg, offset,
@@ -91,8 +91,8 @@ int ext4_check_dir_entry (const char * function, struct inode * dir,
return error_msg == NULL ? 1 : 0;
}
-static int ext4_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int ext4_readdir(struct file *filp,
+ void *dirent, filldir_t filldir)
{
int error = 0;
unsigned long offset;
@@ -102,6 +102,7 @@ static int ext4_readdir(struct file * filp,
int err;
struct inode *inode = filp->f_path.dentry->d_inode;
int ret = 0;
+ int dir_has_error = 0;
sb = inode->i_sb;
@@ -129,7 +130,8 @@ static int ext4_readdir(struct file * filp,
struct buffer_head *bh = NULL;
map_bh.b_state = 0;
- err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+ err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+ 0, 0, 0);
if (err > 0) {
pgoff_t index = map_bh.b_blocknr >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -147,9 +149,13 @@ static int ext4_readdir(struct file * filp,
* of recovering data when there's a bad sector
*/
if (!bh) {
- ext4_error (sb, "ext4_readdir",
- "directory #%lu contains a hole at offset %lu",
- inode->i_ino, (unsigned long)filp->f_pos);
+ if (!dir_has_error) {
+ ext4_error(sb, __func__, "directory #%lu "
+ "contains a hole at offset %Lu",
+ inode->i_ino,
+ (unsigned long long) filp->f_pos);
+ dir_has_error = 1;
+ }
/* corrupt size? Maybe no more blocks to read */
if (filp->f_pos > inode->i_blocks << 9)
break;
@@ -186,14 +192,14 @@ revalidate:
while (!error && filp->f_pos < inode->i_size
&& offset < sb->s_blocksize) {
de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
- if (!ext4_check_dir_entry ("ext4_readdir", inode, de,
- bh, offset)) {
+ if (!ext4_check_dir_entry("ext4_readdir", inode, de,
+ bh, offset)) {
/*
* On error, skip the f_pos to the next block
*/
filp->f_pos = (filp->f_pos |
(sb->s_blocksize - 1)) + 1;
- brelse (bh);
+ brelse(bh);
ret = stored;
goto out;
}
@@ -217,12 +223,12 @@ revalidate:
break;
if (version != filp->f_version)
goto revalidate;
- stored ++;
+ stored++;
}
filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
}
offset = 0;
- brelse (bh);
+ brelse(bh);
}
out:
return ret;
@@ -272,7 +278,7 @@ static void free_rb_tree_fname(struct rb_root *root)
while (n) {
/* Do the node's children first */
- if ((n)->rb_left) {
+ if (n->rb_left) {
n = n->rb_left;
continue;
}
@@ -289,9 +295,9 @@ static void free_rb_tree_fname(struct rb_root *root)
parent = rb_parent(n);
fname = rb_entry(n, struct fname, rb_hash);
while (fname) {
- struct fname * old = fname;
+ struct fname *old = fname;
fname = fname->next;
- kfree (old);
+ kfree(old);
}
if (!parent)
root->rb_node = NULL;
@@ -301,24 +307,18 @@ static void free_rb_tree_fname(struct rb_root *root)
parent->rb_right = NULL;
n = parent;
}
- root->rb_node = NULL;
}
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
{
struct dir_private_info *p;
- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->root.rb_node = NULL;
- p->curr_node = NULL;
- p->extra_fname = NULL;
- p->last_pos = 0;
p->curr_hash = pos2maj_hash(pos);
p->curr_minor_hash = pos2min_hash(pos);
- p->next_hash = 0;
return p;
}
@@ -336,7 +336,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
struct ext4_dir_entry_2 *dirent)
{
struct rb_node **p, *parent = NULL;
- struct fname * fname, *new_fn;
+ struct fname *fname, *new_fn;
struct dir_private_info *info;
int len;
@@ -393,19 +393,20 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
* for all entres on the fname linked list. (Normally there is only
* one entry on the linked list, unless there are 62 bit hash collisions.)
*/
-static int call_filldir(struct file * filp, void * dirent,
+static int call_filldir(struct file *filp, void *dirent,
filldir_t filldir, struct fname *fname)
{
struct dir_private_info *info = filp->private_data;
loff_t curr_pos;
struct inode *inode = filp->f_path.dentry->d_inode;
- struct super_block * sb;
+ struct super_block *sb;
int error;
sb = inode->i_sb;
if (!fname) {
- printk("call_filldir: called with null fname?!?\n");
+ printk(KERN_ERR "ext4: call_filldir: called with "
+ "null fname?!?\n");
return 0;
}
curr_pos = hash2pos(fname->hash, fname->minor_hash);
@@ -416,7 +417,7 @@ static int call_filldir(struct file * filp, void * dirent,
get_dtype(sb, fname->file_type));
if (error) {
filp->f_pos = curr_pos;
- info->extra_fname = fname->next;
+ info->extra_fname = fname;
return error;
}
fname = fname->next;
@@ -424,8 +425,8 @@ static int call_filldir(struct file * filp, void * dirent,
return 0;
}
-static int ext4_dx_readdir(struct file * filp,
- void * dirent, filldir_t filldir)
+static int ext4_dx_readdir(struct file *filp,
+ void *dirent, filldir_t filldir)
{
struct dir_private_info *info = filp->private_data;
struct inode *inode = filp->f_path.dentry->d_inode;
@@ -433,7 +434,7 @@ static int ext4_dx_readdir(struct file * filp,
int ret;
if (!info) {
- info = create_dir_info(filp->f_pos);
+ info = ext4_htree_create_dir_info(filp->f_pos);
if (!info)
return -ENOMEM;
filp->private_data = info;
@@ -455,11 +456,21 @@ static int ext4_dx_readdir(struct file * filp,
* If there are any leftover names on the hash collision
* chain, return them first.
*/
- if (info->extra_fname &&
- call_filldir(filp, dirent, filldir, info->extra_fname))
- goto finished;
+ if (info->extra_fname) {
+ if (call_filldir(filp, dirent, filldir, info->extra_fname))
+ goto finished;
- if (!info->curr_node)
+ info->extra_fname = NULL;
+ info->curr_node = rb_next(info->curr_node);
+ if (!info->curr_node) {
+ if (info->next_hash == ~0) {
+ filp->f_pos = EXT4_HTREE_EOF;
+ goto finished;
+ }
+ info->curr_hash = info->next_hash;
+ info->curr_minor_hash = 0;
+ }
+ } else if (!info->curr_node)
info->curr_node = rb_first(&info->root);
while (1) {
@@ -506,7 +517,7 @@ finished:
return 0;
}
-static int ext4_release_dir (struct inode * inode, struct file * filp)
+static int ext4_release_dir(struct inode *inode, struct file *filp)
{
if (filp->private_data)
ext4_htree_free_dir_info(filp->private_data);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac..6690a41cdd9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
#include "ext4_i.h"
/*
- * The second extended filesystem constants/structures
+ * The fourth extended filesystem constants/structures
*/
/*
@@ -44,9 +44,9 @@
#ifdef EXT4FS_DEBUG
#define ext4_debug(f, a...) \
do { \
- printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
- __FILE__, __LINE__, __FUNCTION__); \
- printk (KERN_DEBUG f, ## a); \
+ printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
+ __FILE__, __LINE__, __func__); \
+ printk(KERN_DEBUG f, ## a); \
} while (0)
#else
#define ext4_debug(f, a...) do {} while (0)
@@ -74,6 +74,9 @@
#define EXT4_MB_HINT_GOAL_ONLY 256
/* goal is meaningful */
#define EXT4_MB_HINT_TRY_GOAL 512
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED 1024
+
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -125,7 +128,7 @@ struct ext4_allocation_request {
#else
# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
#endif
-#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof (__u32))
+#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
#ifdef __KERNEL__
# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
#else
@@ -170,6 +173,15 @@ struct ext4_group_desc
__u32 bg_reserved2[3];
};
+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+ __u32 free_inodes;
+ __u32 free_blocks;
+};
+
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
@@ -233,7 +245,7 @@ struct ext4_group_desc
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
+#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
/*
* Inode dynamic state flags
@@ -279,8 +291,6 @@ struct ext4_new_group_data {
#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS
#define EXT4_IOC_GETVERSION _IOR('f', 3, long)
#define EXT4_IOC_SETVERSION _IOW('f', 4, long)
-#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
-#define EXT4_IOC_GROUP_ADD _IOW('f', 8,struct ext4_new_group_input)
#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION
#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION
#ifdef CONFIG_JBD2_DEBUG
@@ -288,7 +298,10 @@ struct ext4_new_group_data {
#endif
#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long)
#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long)
-#define EXT4_IOC_MIGRATE _IO('f', 7)
+#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
+#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
+#define EXT4_IOC_MIGRATE _IO('f', 9)
+ /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
/*
* ioctl commands in 32 bit emulation
@@ -526,7 +539,9 @@ do { \
#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
-#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
+#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
+
/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -647,11 +662,14 @@ struct ext4_super_block {
__le16 s_mmp_interval; /* # seconds to wait in MMP checking */
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u32 s_reserved[163]; /* Padding to the end of the block */
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad2;
+ __le16 s_reserved_pad;
+ __u32 s_reserved[162]; /* Padding to the end of the block */
};
#ifdef __KERNEL__
-static inline struct ext4_sb_info * EXT4_SB(struct super_block *sb)
+static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
{
return sb->s_fs_info;
}
@@ -709,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
*/
#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
- ( EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
+ (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask))
#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
- ( EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
+ (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask))
#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \
- ( EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
+ (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask))
#define EXT4_SET_COMPAT_FEATURE(sb,mask) \
EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \
@@ -773,6 +791,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
#define EXT4_DEF_RESUID 0
#define EXT4_DEF_RESGID 0
+#define EXT4_DEF_INODE_READAHEAD_BLKS 32
+
/*
* Default mount options
*/
@@ -938,6 +958,24 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
unsigned long *blockgrpp, ext4_grpblk_t *offsetp);
+extern struct proc_dir_entry *ext4_proc_root;
+
+#ifdef CONFIG_PROC_FS
+extern const struct file_operations ext4_ui_proc_fops;
+
+#define EXT4_PROC_HANDLER(name, var) \
+do { \
+ proc = proc_create_data(name, mode, sbi->s_proc, \
+ &ext4_ui_proc_fops, &sbi->s_##var); \
+ if (proc == NULL) { \
+ printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \
+ goto err_out; \
+ } \
+} while (0)
+#else
+#define EXT4_PROC_HANDLER(name, var)
+#endif
+
/*
* Function prototypes
*/
@@ -958,25 +996,27 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);
-extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, int *errp);
-extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
+extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ s64 nblocks);
+extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
-extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
- ext4_fsblk_t block, unsigned long count,
+extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb,
+ ext4_fsblk_t block, unsigned long count,
unsigned long *pdquot_freed_blocks);
-extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *);
-extern void ext4_check_blocks_bitmap (struct super_block *);
+extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
+extern void ext4_check_blocks_bitmap(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
ext4_group_t block_group,
struct buffer_head ** bh);
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
-extern void ext4_init_block_alloc_info(struct inode *);
-extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv);
/* dir.c */
extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -988,20 +1028,20 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
/* fsync.c */
-extern int ext4_sync_file (struct file *, struct dentry *, int);
+extern int ext4_sync_file(struct file *, struct dentry *, int);
/* hash.c */
extern int ext4fs_dirhash(const char *name, int len, struct
dx_hash_info *hinfo);
/* ialloc.c */
-extern struct inode * ext4_new_inode (handle_t *, struct inode *, int);
-extern void ext4_free_inode (handle_t *, struct inode *);
-extern struct inode * ext4_orphan_get (struct super_block *, unsigned long);
-extern unsigned long ext4_count_free_inodes (struct super_block *);
-extern unsigned long ext4_count_dirs (struct super_block *);
-extern void ext4_check_inodes_bitmap (struct super_block *);
-extern unsigned long ext4_count_free (struct buffer_head *, unsigned);
+extern struct inode * ext4_new_inode(handle_t *, struct inode *, int);
+extern void ext4_free_inode(handle_t *, struct inode *);
+extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
+extern unsigned long ext4_count_free_inodes(struct super_block *);
+extern unsigned long ext4_count_dirs(struct super_block *);
+extern void ext4_check_inodes_bitmap(struct super_block *);
+extern unsigned long ext4_count_free(struct buffer_head *, unsigned);
/* mballoc.c */
extern long ext4_mb_stats;
@@ -1011,11 +1051,15 @@ extern int ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
extern int ext4_mb_reserve_blocks(struct super_block *, int);
-extern void ext4_mb_discard_inode_preallocations(struct inode *);
+extern void ext4_discard_preallocations(struct inode *);
extern int __init init_ext4_mballoc(void);
extern void exit_ext4_mballoc(void);
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+ ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
+ ext4_grpblk_t add);
/* inode.c */
@@ -1025,35 +1069,41 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
ext4_lblk_t, int, int *);
struct buffer_head *ext4_bread(handle_t *, struct inode *,
ext4_lblk_t, int, int *);
+int ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock, unsigned long maxblocks,
struct buffer_head *bh_result,
int create, int extend_disksize);
extern struct inode *ext4_iget(struct super_block *, unsigned long);
-extern int ext4_write_inode (struct inode *, int);
-extern int ext4_setattr (struct dentry *, struct iattr *);
-extern void ext4_delete_inode (struct inode *);
-extern int ext4_sync_inode (handle_t *, struct inode *);
-extern void ext4_discard_reservation (struct inode *);
+extern int ext4_write_inode(struct inode *, int);
+extern int ext4_setattr(struct dentry *, struct iattr *);
+extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
+extern void ext4_delete_inode(struct inode *);
+extern int ext4_sync_inode(handle_t *, struct inode *);
extern void ext4_dirty_inode(struct inode *);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
-extern void ext4_truncate (struct inode *);
+extern int ext4_can_truncate(struct inode *inode);
+extern void ext4_truncate(struct inode *);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
+extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
+extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
-extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long);
+extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
/* migrate.c */
-extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int,
- unsigned long);
+extern int ext4_ext_migrate(struct inode *);
/* namei.c */
extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
@@ -1068,14 +1118,14 @@ extern int ext4_group_extend(struct super_block *sb,
ext4_fsblk_t n_blocks_count);
/* super.c */
-extern void ext4_error (struct super_block *, const char *, const char *, ...)
+extern void ext4_error(struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
-extern void __ext4_std_error (struct super_block *, const char *, int);
-extern void ext4_abort (struct super_block *, const char *, const char *, ...)
+extern void __ext4_std_error(struct super_block *, const char *, int);
+extern void ext4_abort(struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
-extern void ext4_warning (struct super_block *, const char *, const char *, ...)
+extern void ext4_warning(struct super_block *, const char *, const char *, ...)
__attribute__ ((format (printf, 3, 4)));
-extern void ext4_update_dynamic_rev (struct super_block *sb);
+extern void ext4_update_dynamic_rev(struct super_block *sb);
extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
__u32 compat);
extern int ext4_update_rocompat_feature(handle_t *handle,
@@ -1148,7 +1198,7 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
static inline
struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
- ext4_group_t group)
+ ext4_group_t group)
{
struct ext4_group_info ***grp_info;
long indexv, indexh;
@@ -1159,12 +1209,45 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
}
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+ ext4_group_t block_group)
+{
+ return block_group >> sbi->s_log_groups_per_flex;
+}
+
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+ return 1 << sbi->s_log_groups_per_flex;
+}
+
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
- __ext4_std_error((sb), __FUNCTION__, (errno)); \
+ __ext4_std_error((sb), __func__, (errno)); \
} while (0)
+#ifdef CONFIG_SMP
+/* Each CPU can accumulate FBC_BATCH blocks in their local
+ * counters. So we need to make sure we have free blocks more
+ * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times.
+ */
+#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
+#else
+#define EXT4_FREEBLOCKS_WATERMARK 0
+#endif
+
+static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
+{
+ /*
+ * XXX: replace with spinlock if seen contended -bzzz
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (newsize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = newsize;
+ up_write(&EXT4_I(inode)->i_data_sem);
+ return ;
+}
+
/*
* Inodes and files operations
*/
@@ -1187,11 +1270,13 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
/* extents.c */
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
+ int chunk);
extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock,
unsigned long max_blocks, struct buffer_head *bh_result,
int create, int extend_disksize);
-extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_truncate(struct inode *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
@@ -1199,7 +1284,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
sector_t block, unsigned long max_blocks,
struct buffer_head *bh, int create,
- int extend_disksize);
+ int extend_disksize, int flag);
#endif /* __KERNEL__ */
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b595fa..bec7ce59fc0 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -124,6 +124,19 @@ struct ext4_ext_path {
#define EXT4_EXT_CACHE_GAP 1
#define EXT4_EXT_CACHE_EXTENT 2
+/*
+ * to be called by ext4_ext_walk_space()
+ * negative retcode - error
+ * positive retcode - signal for ext4_ext_walk_space(), see below
+ * callback must return valid extent (passed or newly created)
+ */
+typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
+ struct ext4_ext_cache *,
+ struct ext4_extent *, void *);
+
+#define EXT_CONTINUE 0
+#define EXT_BREAK 1
+#define EXT_REPEAT 2
#define EXT_MAX_BLOCK 0xffffffff
@@ -212,15 +225,20 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}
+extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
extern int ext4_extent_tree_init(handle_t *, struct inode *);
-extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
+extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+ int num,
+ struct ext4_ext_path *path);
extern int ext4_ext_try_to_merge(struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *);
extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
+extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
+ ext_prepare_callback, void *);
extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
struct ext4_ext_path *);
extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae255d7..5c124c0ac6d 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,38 +33,6 @@ typedef __u32 ext4_lblk_t;
/* data type for block group number */
typedef unsigned long ext4_group_t;
-struct ext4_reserve_window {
- ext4_fsblk_t _rsv_start; /* First byte reserved */
- ext4_fsblk_t _rsv_end; /* Last byte reserved or 0 */
-};
-
-struct ext4_reserve_window_node {
- struct rb_node rsv_node;
- __u32 rsv_goal_size;
- __u32 rsv_alloc_hit;
- struct ext4_reserve_window rsv_window;
-};
-
-struct ext4_block_alloc_info {
- /* information about reservation window */
- struct ext4_reserve_window_node rsv_window_node;
- /*
- * was i_next_alloc_block in ext4_inode_info
- * is the logical (file-relative) number of the
- * most-recently-allocated block in this file.
- * We use this for detecting linearly ascending allocation requests.
- */
- ext4_lblk_t last_alloc_logical_block;
- /*
- * Was i_next_alloc_goal in ext4_inode_info
- * is the *physical* companion to i_next_alloc_block.
- * it the physical block number of the block which was most-recentl
- * allocated to this file. This give us the goal (target) for the next
- * allocation when we detect linearly ascending requests.
- */
- ext4_fsblk_t last_alloc_physical_block;
-};
-
#define rsv_start rsv_window._rsv_start
#define rsv_end rsv_window._rsv_end
@@ -79,7 +47,7 @@ struct ext4_ext_cache {
};
/*
- * third extended file system inode data in memory
+ * fourth extended file system inode data in memory
*/
struct ext4_inode_info {
__le32 i_data[15]; /* unconverted */
@@ -97,11 +65,8 @@ struct ext4_inode_info {
ext4_group_t i_block_group;
__u32 i_state; /* Dynamic state flags for ext4 */
- /* block reservation info */
- struct ext4_block_alloc_info *i_block_alloc_info;
-
ext4_lblk_t i_dir_start_lookup;
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
/*
* Extended attributes can be read independently of the main file
* data. Taking i_mutex even when reading would cause contention
@@ -111,7 +76,7 @@ struct ext4_inode_info {
*/
struct rw_semaphore xattr_sem;
#endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
#endif
@@ -150,6 +115,7 @@ struct ext4_inode_info {
*/
struct rw_semaphore i_data_sem;
struct inode vfs_inode;
+ struct jbd2_inode jinode;
unsigned long i_ext_generation;
struct ext4_ext_cache i_cached_extent;
@@ -162,6 +128,13 @@ struct ext4_inode_info {
/* mballoc */
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
+
+ /* allocation reservation info for delalloc */
+ unsigned long i_reserved_data_blocks;
+ unsigned long i_reserved_meta_blocks;
+ unsigned long i_allocated_meta_blocks;
+ unsigned short i_delalloc_reserved_flag;
+ spinlock_t i_block_reservation_lock;
};
#endif /* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9255a7d28b2..b455c685a98 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -51,6 +51,14 @@
EXT4_XATTR_TRANS_BLOCKS - 2 + \
2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+/*
+ * Define the number of metadata blocks we need to account to modify data.
+ *
+ * This include super block, inode block, quota blocks and xattr blocks
+ */
+#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
+ 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
+
/* Delete operations potentially hit one directory's namespace plus an
* entire inode, plus arbitrary amounts of bitmap/indirection data. Be
* generous. We can grow the delete transaction later if necessary. */
@@ -142,19 +150,17 @@ int __ext4_journal_dirty_metadata(const char *where,
handle_t *handle, struct buffer_head *bh);
#define ext4_journal_get_undo_access(handle, bh) \
- __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_undo_access(__func__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
- __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_write_access(__func__, (handle), (bh))
#define ext4_journal_revoke(handle, blocknr, bh) \
- __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+ __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
#define ext4_journal_get_create_access(handle, bh) \
- __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_create_access(__func__, (handle), (bh))
#define ext4_journal_dirty_metadata(handle, bh) \
- __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
+ __ext4_journal_dirty_metadata(__func__, (handle), (bh))
#define ext4_journal_forget(handle, bh) \
- __ext4_journal_forget(__FUNCTION__, (handle), (bh))
-
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
+ __ext4_journal_forget(__func__, (handle), (bh))
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
@@ -165,7 +171,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
}
#define ext4_journal_stop(handle) \
- __ext4_journal_stop(__FUNCTION__, (handle))
+ __ext4_journal_stop(__func__, (handle))
static inline handle_t *ext4_journal_current_handle(void)
{
@@ -192,6 +198,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
return jbd2_journal_force_commit(journal);
}
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+ return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+}
+
/* super.c */
int ext4_force_commit(struct super_block *sb);
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 5802e69f219..6a0b40d4326 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -25,7 +25,7 @@
#include <linux/rbtree.h>
/*
- * third extended-fs super-block data in memory
+ * fourth extended-fs super-block data in memory
*/
struct ext4_sb_info {
unsigned long s_desc_size; /* Size of a group descriptor in bytes */
@@ -40,8 +40,8 @@ struct ext4_sb_info {
unsigned long s_blocks_last; /* Last seen block count */
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
struct buffer_head * s_sbh; /* Buffer containing the super block */
- struct ext4_super_block * s_es; /* Pointer to the super block in the buffer */
- struct buffer_head ** s_group_desc;
+ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
+ struct buffer_head **s_group_desc;
unsigned long s_mount_opt;
ext4_fsblk_t s_sb_block;
uid_t s_resuid;
@@ -52,6 +52,7 @@ struct ext4_sb_info {
int s_desc_per_block_bits;
int s_inode_size;
int s_first_ino;
+ unsigned int s_inode_readahead_blks;
spinlock_t s_next_gen_lock;
u32 s_next_generation;
u32 s_hash_seed[4];
@@ -59,16 +60,17 @@ struct ext4_sb_info {
struct percpu_counter s_freeblocks_counter;
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
+ struct percpu_counter s_dirtyblocks_counter;
struct blockgroup_lock s_blockgroup_lock;
+ struct proc_dir_entry *s_proc;
/* root of the per fs reservation window tree */
spinlock_t s_rsv_window_lock;
struct rb_root s_rsv_window_root;
- struct ext4_reserve_window_node s_rsv_window_head;
/* Journaling */
- struct inode * s_journal_inode;
- struct journal_s * s_journal;
+ struct inode *s_journal_inode;
+ struct journal_s *s_journal;
struct list_head s_orphan;
unsigned long s_commit_interval;
struct block_device *journal_bdev;
@@ -106,12 +108,12 @@ struct ext4_sb_info {
/* tunables */
unsigned long s_stripe;
- unsigned long s_mb_stream_request;
- unsigned long s_mb_max_to_scan;
- unsigned long s_mb_min_to_scan;
- unsigned long s_mb_stats;
- unsigned long s_mb_order2_reqs;
- unsigned long s_mb_group_prealloc;
+ unsigned int s_mb_stream_request;
+ unsigned int s_mb_max_to_scan;
+ unsigned int s_mb_min_to_scan;
+ unsigned int s_mb_stats;
+ unsigned int s_mb_order2_reqs;
+ unsigned int s_mb_group_prealloc;
/* where last allocation was done - for stream allocation */
unsigned long s_mb_last_group;
unsigned long s_mb_last_start;
@@ -121,7 +123,6 @@ struct ext4_sb_info {
int s_mb_history_cur;
int s_mb_history_max;
int s_mb_history_num;
- struct proc_dir_entry *s_mb_proc;
spinlock_t s_mb_history_lock;
int s_mb_history_filter;
@@ -143,6 +144,9 @@ struct ext4_sb_info {
/* locality groups */
struct ext4_locality_group *s_locality_groups;
+
+ unsigned int s_log_groups_per_flex;
+ struct flex_groups *s_flex_groups;
};
#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47929c4e3da..ea2ce3c0ae6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -40,6 +40,7 @@
#include <linux/slab.h>
#include <linux/falloc.h>
#include <asm/uaccess.h>
+#include <linux/fiemap.h>
#include "ext4_jbd2.h"
#include "ext4_extents.h"
@@ -92,17 +93,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_journal_restart(handle_t *handle, int needed)
{
int err;
if (handle->h_buffer_credits > needed)
- return handle;
- if (!ext4_journal_extend(handle, needed))
- return handle;
- err = ext4_journal_restart(handle, needed);
-
- return handle;
+ return 0;
+ err = ext4_journal_extend(handle, needed);
+ if (err <= 0)
+ return err;
+ return ext4_journal_restart(handle, needed);
}
/*
@@ -180,15 +180,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
return bg_start + colour + block;
}
+/*
+ * Allocation for a meta data block
+ */
static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex, int *err)
{
ext4_fsblk_t goal, newblock;
goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
- newblock = ext4_new_block(handle, inode, goal, err);
+ newblock = ext4_new_meta_block(handle, inode, goal, err);
return newblock;
}
@@ -246,6 +249,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
return size;
}
+/*
+ * Calculate the number of metadata blocks needed
+ * to allocate @blocks
+ * Worse case is one block per extent
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int lcap, icap, rcap, leafs, idxs, num;
+ int newextents = blocks;
+
+ rcap = ext4_ext_space_root_idx(inode);
+ lcap = ext4_ext_space_block(inode);
+ icap = ext4_ext_space_block_idx(inode);
+
+ /* number of new leaf blocks needed */
+ num = leafs = (newextents + lcap - 1) / lcap;
+
+ /*
+ * Worse case, we need separate index block(s)
+ * to link all new leaf blocks
+ */
+ idxs = (leafs + icap - 1) / icap;
+ do {
+ num += idxs;
+ idxs = (idxs + icap - 1) / icap;
+ } while (idxs > rcap);
+
+ return num;
+}
+
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
@@ -351,8 +384,8 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
ext_debug("\n");
}
#else
-#define ext4_ext_show_path(inode,path)
-#define ext4_ext_show_leaf(inode,path)
+#define ext4_ext_show_path(inode, path)
+#define ext4_ext_show_leaf(inode, path)
#endif
void ext4_ext_drop_refs(struct ext4_ext_path *path)
@@ -408,9 +441,10 @@ ext4_ext_binsearch_idx(struct inode *inode,
for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) {
if (k != 0 &&
le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) {
- printk("k=%d, ix=0x%p, first=0x%p\n", k,
- ix, EXT_FIRST_INDEX(eh));
- printk("%u <= %u\n",
+ printk(KERN_DEBUG "k=%d, ix=0x%p, "
+ "first=0x%p\n", k,
+ ix, EXT_FIRST_INDEX(eh));
+ printk(KERN_DEBUG "%u <= %u\n",
le32_to_cpu(ix->ei_block),
le32_to_cpu(ix[-1].ei_block));
}
@@ -524,6 +558,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
alloc = 1;
}
path[0].p_hdr = eh;
+ path[0].p_bh = NULL;
i = depth;
/* walk through the tree */
@@ -552,12 +587,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
}
path[ppos].p_depth = i;
- path[ppos].p_hdr = eh;
path[ppos].p_ext = NULL;
path[ppos].p_idx = NULL;
/* find extent */
ext4_ext_binsearch(inode, path + ppos, block);
+ /* if not an empty leaf */
+ if (path[ppos].p_ext)
+ path[ppos].p_block = ext_pblock(path[ppos].p_ext);
ext4_ext_show_path(inode, path);
@@ -688,7 +725,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
/* allocate all needed blocks */
ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path,
+ newext, &err);
if (newblock == 0)
goto cleanup;
ablocks[a] = newblock;
@@ -884,7 +922,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
ext4_fsblk_t newblock;
int err = 0;
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
if (newblock == 0)
return err;
@@ -981,6 +1019,8 @@ repeat:
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
err = ext4_ext_split(handle, inode, path, newext, i);
+ if (err)
+ goto out;
/* refill path */
ext4_ext_drop_refs(path);
@@ -1403,7 +1443,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
/*
* get the next allocated block if the extent in the path
- * is before the requested block(s)
+ * is before the requested block(s)
*/
if (b2 < b1) {
b2 = ext4_ext_next_allocated_block(path);
@@ -1437,7 +1477,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *newext)
{
- struct ext4_extent_header * eh;
+ struct ext4_extent_header *eh;
struct ext4_extent *ex, *fex;
struct ext4_extent *nearex; /* nearest extent */
struct ext4_ext_path *npath = NULL;
@@ -1587,6 +1627,113 @@ cleanup:
return err;
}
+int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
+ ext4_lblk_t num, ext_prepare_callback func,
+ void *cbdata)
+{
+ struct ext4_ext_path *path = NULL;
+ struct ext4_ext_cache cbex;
+ struct ext4_extent *ex;
+ ext4_lblk_t next, start = 0, end = 0;
+ ext4_lblk_t last = block + num;
+ int depth, exists, err = 0;
+
+ BUG_ON(func == NULL);
+ BUG_ON(inode == NULL);
+
+ while (block < last && block != EXT_MAX_BLOCK) {
+ num = last - block;
+ /* find extent for this block */
+ path = ext4_ext_find_extent(inode, block, path);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ path = NULL;
+ break;
+ }
+
+ depth = ext_depth(inode);
+ BUG_ON(path[depth].p_hdr == NULL);
+ ex = path[depth].p_ext;
+ next = ext4_ext_next_allocated_block(path);
+
+ exists = 0;
+ if (!ex) {
+ /* there is no extent yet, so try to allocate
+ * all requested space */
+ start = block;
+ end = block + num;
+ } else if (le32_to_cpu(ex->ee_block) > block) {
+ /* need to allocate space before found extent */
+ start = block;
+ end = le32_to_cpu(ex->ee_block);
+ if (block + num < end)
+ end = block + num;
+ } else if (block >= le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex)) {
+ /* need to allocate space after found extent */
+ start = block;
+ end = block + num;
+ if (end >= next)
+ end = next;
+ } else if (block >= le32_to_cpu(ex->ee_block)) {
+ /*
+ * some part of requested space is covered
+ * by found extent
+ */
+ start = block;
+ end = le32_to_cpu(ex->ee_block)
+ + ext4_ext_get_actual_len(ex);
+ if (block + num < end)
+ end = block + num;
+ exists = 1;
+ } else {
+ BUG();
+ }
+ BUG_ON(end <= start);
+
+ if (!exists) {
+ cbex.ec_block = start;
+ cbex.ec_len = end - start;
+ cbex.ec_start = 0;
+ cbex.ec_type = EXT4_EXT_CACHE_GAP;
+ } else {
+ cbex.ec_block = le32_to_cpu(ex->ee_block);
+ cbex.ec_len = ext4_ext_get_actual_len(ex);
+ cbex.ec_start = ext_pblock(ex);
+ cbex.ec_type = EXT4_EXT_CACHE_EXTENT;
+ }
+
+ BUG_ON(cbex.ec_len == 0);
+ err = func(inode, path, &cbex, ex, cbdata);
+ ext4_ext_drop_refs(path);
+
+ if (err < 0)
+ break;
+
+ if (err == EXT_REPEAT)
+ continue;
+ else if (err == EXT_BREAK) {
+ err = 0;
+ break;
+ }
+
+ if (ext_depth(inode) != depth) {
+ /* depth was changed. we have to realloc path */
+ kfree(path);
+ path = NULL;
+ }
+
+ block = cbex.ec_block + cbex.ec_len;
+ }
+
+ if (path) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ }
+
+ return err;
+}
+
static void
ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block,
__u32 len, ext4_fsblk_t start, int type)
@@ -1709,54 +1856,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
}
/*
- * ext4_ext_calc_credits_for_insert:
- * This routine returns max. credits that the extent tree can consume.
- * It should be OK for low-performance paths like ->writepage()
- * To allow many writing processes to fit into a single transaction,
- * the caller should calculate credits under i_data_sem and
- * pass the actual path.
+ * ext4_ext_calc_credits_for_single_extent:
+ * This routine returns max. credits that needed to insert an extent
+ * to the extent tree.
+ * When pass the actual path, the caller should calculate credits
+ * under i_data_sem.
*/
-int ext4_ext_calc_credits_for_insert(struct inode *inode,
+int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
struct ext4_ext_path *path)
{
- int depth, needed;
-
if (path) {
+ int depth = ext_depth(inode);
+ int ret = 0;
+
/* probably there is space in leaf? */
- depth = ext_depth(inode);
if (le16_to_cpu(path[depth].p_hdr->eh_entries)
- < le16_to_cpu(path[depth].p_hdr->eh_max))
- return 1;
- }
-
- /*
- * given 32-bit logical block (4294967296 blocks), max. tree
- * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
- * Let's also add one more level for imbalance.
- */
- depth = 5;
+ < le16_to_cpu(path[depth].p_hdr->eh_max)) {
- /* allocation of new data block(s) */
- needed = 2;
+ /*
+ * There are some space in the leaf tree, no
+ * need to account for leaf block credit
+ *
+ * bitmaps and block group descriptor blocks
+ * and other metadat blocks still need to be
+ * accounted.
+ */
+ /* 1 bitmap, 1 block group descriptor */
+ ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
+ }
+ }
- /*
- * tree can be full, so it would need to grow in depth:
- * we need one credit to modify old root, credits for
- * new root will be added in split accounting
- */
- needed += 1;
+ return ext4_chunk_trans_blocks(inode, nrblocks);
+}
- /*
- * Index split can happen, we would need:
- * allocate intermediate indexes (bitmap + group)
- * + change two blocks at each level, but root (already included)
- */
- needed += (depth * 2) + (depth * 2);
+/*
+ * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ *
+ * if nrblocks are fit in a single extent (chunk flag is 1), then
+ * in the worse case, each tree level index/leaf need to be changed
+ * if the tree split due to insert a new extent, then the old tree
+ * index/leaf need to be updated too
+ *
+ * If the nrblocks are discontiguous, they could cause
+ * the whole tree split more than once, but this is really rare.
+ */
+int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+{
+ int index;
+ int depth = ext_depth(inode);
- /* any allocation modifies superblock */
- needed += 1;
+ if (chunk)
+ index = depth * 2;
+ else
+ index = depth * 3;
- return needed;
+ return index;
}
static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
@@ -1872,22 +2026,22 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
BUG_ON(b != ex_ee_block + ex_ee_len - 1);
}
- /* at present, extent can't cross block group: */
- /* leaf + bitmap + group desc + sb + inode */
- credits = 5;
+ /*
+ * 3 for leaf, sb, and inode plus 2 (bmap and group
+ * descriptor) for each block group; assume two block
+ * groups plus ex_ee_len/blocks_per_block_group for
+ * the worst case
+ */
+ credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
if (ex == EXT_FIRST_EXTENT(eh)) {
correct_index = 1;
credits += (ext_depth(inode)) + 1;
}
-#ifdef CONFIG_QUOTA
credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
-#endif
- handle = ext4_ext_journal_restart(handle, credits);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
+ err = ext4_ext_journal_restart(handle, credits);
+ if (err)
goto out;
- }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -2097,7 +2251,7 @@ void ext4_ext_init(struct super_block *sb)
*/
if (test_opt(sb, EXTENTS)) {
- printk("EXT4-fs: file extents enabled");
+ printk(KERN_INFO "EXT4-fs: file extents enabled");
#ifdef AGGRESSIVE_TEST
printk(", aggressive tests");
#endif
@@ -2287,7 +2441,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
unsigned int newdepth;
/* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
if (allocated <= EXT4_EXT_ZERO_LEN) {
- /* Mark first half uninitialized.
+ /*
+ * iblock == ee_block is handled by the zerouout
+ * at the beginning.
+ * Mark first half uninitialized.
* Mark second half initialized and zero out the
* initialized extent
*/
@@ -2310,7 +2467,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ex->ee_len = orig_ex.ee_len;
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
- /* zeroed the full extent */
+ /* blocks available from iblock */
return allocated;
} else if (err)
@@ -2338,6 +2495,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
err = PTR_ERR(path);
return err;
}
+ /* get the second half extent details */
ex = path[depth].p_ext;
err = ext4_ext_get_access(handle, inode,
path + depth);
@@ -2367,6 +2525,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zeroed the full extent */
+ /* blocks available from iblock */
return allocated;
} else if (err)
@@ -2382,23 +2541,22 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
*/
orig_ex.ee_len = cpu_to_le16(ee_len -
ext4_ext_get_actual_len(ex3));
- if (newdepth != depth) {
- depth = newdepth;
- ext4_ext_drop_refs(path);
- path = ext4_ext_find_extent(inode, iblock, path);
- if (IS_ERR(path)) {
- err = PTR_ERR(path);
- goto out;
- }
- eh = path[depth].p_hdr;
- ex = path[depth].p_ext;
- if (ex2 != &newex)
- ex2 = ex;
-
- err = ext4_ext_get_access(handle, inode, path + depth);
- if (err)
- goto out;
+ depth = newdepth;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, iblock, path);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
}
+ eh = path[depth].p_hdr;
+ ex = path[depth].p_ext;
+ if (ex2 != &newex)
+ ex2 = ex;
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+
allocated = max_blocks;
/* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
@@ -2416,6 +2574,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
ext4_ext_dirty(handle, inode, path + depth);
/* zero out the first half */
+ /* blocks available from iblock */
return allocated;
}
}
@@ -2529,6 +2688,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
int err = 0, depth, ret;
unsigned long allocated = 0;
struct ext4_allocation_request ar;
+ loff_t disksize;
__clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2616,8 +2776,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
if (allocated > max_blocks)
allocated = max_blocks;
- /* mark the buffer unwritten */
- __set_bit(BH_Unwritten, &bh_result->b_state);
+ set_buffer_unwritten(bh_result);
goto out2;
}
@@ -2646,11 +2805,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
goto out2;
}
/*
- * Okay, we need to do block allocation. Lazily initialize the block
- * allocation info here if necessary.
+ * Okay, we need to do block allocation.
*/
- if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info))
- ext4_init_block_alloc_info(inode);
/* find neighbour allocated blocks */
ar.lleft = iblock;
@@ -2710,20 +2866,25 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
/* free data blocks we just allocated */
/* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free() */
- ext4_mb_discard_inode_preallocations(inode);
+ ext4_discard_preallocations(inode);
ext4_free_blocks(handle, inode, ext_pblock(&newex),
ext4_ext_get_actual_len(&newex), 0);
goto out2;
}
- if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = inode->i_size;
-
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
outnew:
- __set_bit(BH_New, &bh_result->b_state);
+ if (extend_disksize) {
+ disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ }
+
+ set_buffer_new(bh_result);
/* Cache only when it is _not_ an uninitialized extent */
if (create != EXT4_CREATE_UNINITIALIZED_EXT)
@@ -2733,7 +2894,7 @@ out:
if (allocated > max_blocks)
allocated = max_blocks;
ext4_ext_show_leaf(inode, path);
- __set_bit(BH_Mapped, &bh_result->b_state);
+ set_buffer_mapped(bh_result);
bh_result->b_bdev = inode->i_sb->s_bdev;
bh_result->b_blocknr = newblock;
out2:
@@ -2744,7 +2905,7 @@ out2:
return err ? err : allocated;
}
-void ext4_ext_truncate(struct inode * inode, struct page *page)
+void ext4_ext_truncate(struct inode *inode)
{
struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
@@ -2755,33 +2916,27 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
/*
* probably first extent we're gonna free will be last in block
*/
- err = ext4_writepage_trans_blocks(inode) + 3;
+ err = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, err);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return;
- }
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
+
+ if (ext4_orphan_add(handle, inode))
+ goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_ext_invalidate_cache(inode);
- ext4_mb_discard_inode_preallocations(inode);
+ ext4_discard_preallocations(inode);
/*
* TODO: optimization is possible here.
* Probably we need not scan at all,
* because page truncation is enough.
*/
- if (ext4_orphan_add(handle, inode))
- goto out_stop;
/* we have to know where to truncate from in crash case */
EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2798,6 +2953,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
handle->h_sync = 1;
out_stop:
+ up_write(&EXT4_I(inode)->i_data_sem);
/*
* If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above.
@@ -2808,33 +2964,11 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- up_write(&EXT4_I(inode)->i_data_sem);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
}
-/*
- * ext4_ext_writepage_trans_blocks:
- * calculate max number of blocks we could modify
- * in order to allocate new block for an inode
- */
-int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
-{
- int needed;
-
- needed = ext4_ext_calc_credits_for_insert(inode, NULL);
-
- /* caller wants to allocate num blocks, but note it includes sb */
- needed = needed * num - (num - 1);
-
-#ifdef CONFIG_QUOTA
- needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
-#endif
-
- return needed;
-}
-
static void ext4_falloc_update_inode(struct inode *inode,
int mode, loff_t new_size, int update_ctime)
{
@@ -2849,10 +2983,11 @@ static void ext4_falloc_update_inode(struct inode *inode,
* Update only when preallocation was requested beyond
* the file size.
*/
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- new_size > i_size_read(inode)) {
- i_size_write(inode, new_size);
- EXT4_I(inode)->i_disksize = new_size;
+ if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+ if (new_size > i_size_read(inode))
+ i_size_write(inode, new_size);
+ if (new_size > EXT4_I(inode)->i_disksize)
+ ext4_update_i_disksize(inode, new_size);
}
}
@@ -2895,10 +3030,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
- block;
/*
- * credits to insert 1 extent into extent tree + buffers to be able to
- * modify 1 super block, 1 block bitmap and 1 group descriptor.
+ * credits to insert 1 extent into extent tree
*/
- credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
mutex_lock(&inode->i_mutex);
retry:
while (ret >= 0 && ret < max_blocks) {
@@ -2911,7 +3045,7 @@ retry:
}
ret = ext4_get_blocks_wrap(handle, inode, block,
max_blocks, &map_bh,
- EXT4_CREATE_UNINITIALIZED_EXT, 0);
+ EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0);
@@ -2945,3 +3079,143 @@ retry:
mutex_unlock(&inode->i_mutex);
return ret > 0 ? ret2 : ret;
}
+
+/*
+ * Callback function called for each extent to gather FIEMAP information.
+ */
+int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
+ struct ext4_ext_cache *newex, struct ext4_extent *ex,
+ void *data)
+{
+ struct fiemap_extent_info *fieinfo = data;
+ unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
+ __u64 logical;
+ __u64 physical;
+ __u64 length;
+ __u32 flags = 0;
+ int error;
+
+ logical = (__u64)newex->ec_block << blksize_bits;
+
+ if (newex->ec_type == EXT4_EXT_CACHE_GAP) {
+ pgoff_t offset;
+ struct page *page;
+ struct buffer_head *bh = NULL;
+
+ offset = logical >> PAGE_SHIFT;
+ page = find_get_page(inode->i_mapping, offset);
+ if (!page || !page_has_buffers(page))
+ return EXT_CONTINUE;
+
+ bh = page_buffers(page);
+
+ if (!bh)
+ return EXT_CONTINUE;
+
+ if (buffer_delay(bh)) {
+ flags |= FIEMAP_EXTENT_DELALLOC;
+ page_cache_release(page);
+ } else {
+ page_cache_release(page);
+ return EXT_CONTINUE;
+ }
+ }
+
+ physical = (__u64)newex->ec_start << blksize_bits;
+ length = (__u64)newex->ec_len << blksize_bits;
+
+ if (ex && ext4_ext_is_uninitialized(ex))
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
+
+ /*
+ * If this extent reaches EXT_MAX_BLOCK, it must be last.
+ *
+ * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
+ * this also indicates no more allocated blocks.
+ *
+ * XXX this might miss a single-block extent at EXT_MAX_BLOCK
+ */
+ if (logical + length - 1 == EXT_MAX_BLOCK ||
+ ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
+ flags |= FIEMAP_EXTENT_LAST;
+
+ error = fiemap_fill_next_extent(fieinfo, logical, physical,
+ length, flags);
+ if (error < 0)
+ return error;
+ if (error == 1)
+ return EXT_BREAK;
+
+ return EXT_CONTINUE;
+}
+
+/* fiemap flags we can handle specified here */
+#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
+
+int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo)
+{
+ __u64 physical = 0;
+ __u64 length;
+ __u32 flags = FIEMAP_EXTENT_LAST;
+ int blockbits = inode->i_sb->s_blocksize_bits;
+ int error = 0;
+
+ /* in-inode? */
+ if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) {
+ struct ext4_iloc iloc;
+ int offset; /* offset of xattr in inode */
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ return error;
+ physical = iloc.bh->b_blocknr << blockbits;
+ offset = EXT4_GOOD_OLD_INODE_SIZE +
+ EXT4_I(inode)->i_extra_isize;
+ physical += offset;
+ length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
+ flags |= FIEMAP_EXTENT_DATA_INLINE;
+ } else { /* external block */
+ physical = EXT4_I(inode)->i_file_acl << blockbits;
+ length = inode->i_sb->s_blocksize;
+ }
+
+ if (physical)
+ error = fiemap_fill_next_extent(fieinfo, 0, physical,
+ length, flags);
+ return (error < 0 ? error : 0);
+}
+
+int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len)
+{
+ ext4_lblk_t start_blk;
+ ext4_lblk_t len_blks;
+ int error = 0;
+
+ /* fallback to generic here if not in extents fmt */
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ return generic_block_fiemap(inode, fieinfo, start, len,
+ ext4_get_block);
+
+ if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
+ return -EBADR;
+
+ if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
+ error = ext4_xattr_fiemap(inode, fieinfo);
+ } else {
+ start_blk = start >> inode->i_sb->s_blocksize_bits;
+ len_blks = len >> inode->i_sb->s_blocksize_bits;
+
+ /*
+ * Walk the extent tree gathering extent information.
+ * ext4_ext_fiemap_cb will push extents back to user.
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
+ error = ext4_ext_walk_space(inode, start_blk, len_blks,
+ ext4_ext_fiemap_cb, fieinfo);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ }
+
+ return error;
+}
+
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6366a..6bd11fba71f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -31,14 +31,14 @@
* from ext4_file_open: open gets called at every open, but release
* gets called only when /all/ the files are closed.
*/
-static int ext4_release_file (struct inode * inode, struct file * filp)
+static int ext4_release_file(struct inode *inode, struct file *filp)
{
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
(atomic_read(&inode->i_writecount) == 1))
{
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_reservation(inode);
+ ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
}
if (is_dx(inode) && filp->private_data)
@@ -123,6 +123,26 @@ force_commit:
return ret;
}
+static struct vm_operations_struct ext4_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ext4_page_mkwrite,
+};
+
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ext4_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+ return 0;
+}
+
+extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len);
+
const struct file_operations ext4_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -133,7 +153,7 @@ const struct file_operations ext4_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ext4_file_mmap,
.open = generic_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
@@ -144,7 +164,8 @@ const struct file_operations ext4_file_operations = {
const struct inode_operations ext4_file_inode_operations = {
.truncate = ext4_truncate,
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+ .getattr = ext4_getattr,
+#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
@@ -152,5 +173,6 @@ const struct inode_operations ext4_file_inode_operations = {
#endif
.permission = ext4_permission,
.fallocate = ext4_fallocate,
+ .fiemap = ext4_fiemap,
};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1c8ba48d4f8..5afe4370840 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -27,6 +27,8 @@
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/jbd2.h>
+#include <linux/blkdev.h>
+#include <linux/marker.h>
#include "ext4.h"
#include "ext4_jbd2.h"
@@ -42,13 +44,18 @@
* inode to disk.
*/
-int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
+int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret = 0;
J_ASSERT(ext4_journal_current_handle() == NULL);
+ trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld",
+ inode->i_sb->s_id, datasync, inode->i_ino,
+ dentry->d_parent->d_inode->i_ino);
+
/*
* data=writeback:
* The caller's filemap_fdatawrite()/wait will sync the data.
@@ -85,6 +92,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
.nr_to_write = 0, /* sys_fsync did this */
};
ret = sync_inode(inode, &wbc);
+ if (journal && (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
}
out:
return ret;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 7eb0604e7ee..c2c0a8d06d0 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
struct ext4_group_desc *gdp);
extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
struct ext4_group_desc *gdp);
-struct buffer_head *read_block_bitmap(struct super_block *sb,
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
ext4_group_t block_group);
extern unsigned ext4_init_block_bitmap(struct super_block *sb,
struct buffer_head *bh,
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 1d6329dbe39..556ca8eba3d 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -27,7 +27,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
sum += DELTA;
b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
- } while(--n);
+ } while (--n);
buf[0] += b0;
buf[1] += b1;
@@ -35,7 +35,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
/* The old legacy hash */
-static __u32 dx_hack_hash (const char *name, int len)
+static __u32 dx_hack_hash(const char *name, int len)
{
__u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
while (len--) {
@@ -59,7 +59,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
val = pad;
if (len > num*4)
len = num * 4;
- for (i=0; i < len; i++) {
+ for (i = 0; i < len; i++) {
if ((i % 4) == 0)
val = pad;
val = msg[i] + (val << 8);
@@ -104,7 +104,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
/* Check to see if the seed is all zero's */
if (hinfo->seed) {
- for (i=0; i < 4; i++) {
+ for (i = 0; i < 4; i++) {
if (hinfo->seed[i])
break;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c6efbab0c80..fe34d74cfb1 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -97,34 +97,46 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
* Return buffer_head of bitmap on success or NULL.
*/
static struct buffer_head *
-read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc *desc;
struct buffer_head *bh = NULL;
+ ext4_fsblk_t bitmap_blk;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
- goto error_out;
+ return NULL;
+ bitmap_blk = ext4_inode_bitmap(sb, desc);
+ bh = sb_getblk(sb, bitmap_blk);
+ if (unlikely(!bh)) {
+ ext4_error(sb, __func__,
+ "Cannot read inode bitmap - "
+ "block_group = %lu, inode_bitmap = %llu",
+ block_group, bitmap_blk);
+ return NULL;
+ }
+ if (buffer_uptodate(bh) &&
+ !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)))
+ return bh;
+
+ lock_buffer(bh);
+ spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc));
- if (!buffer_uptodate(bh)) {
- lock_buffer(bh);
- if (!buffer_uptodate(bh)) {
- ext4_init_inode_bitmap(sb, bh, block_group,
- desc);
- set_buffer_uptodate(bh);
- }
- unlock_buffer(bh);
- }
- } else {
- bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
+ ext4_init_inode_bitmap(sb, bh, block_group, desc);
+ set_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+ return bh;
}
- if (!bh)
- ext4_error(sb, "read_inode_bitmap",
+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+ if (bh_submit_read(bh) < 0) {
+ put_bh(bh);
+ ext4_error(sb, __func__,
"Cannot read inode bitmap - "
"block_group = %lu, inode_bitmap = %llu",
- block_group, ext4_inode_bitmap(sb, desc));
-error_out:
+ block_group, bitmap_blk);
+ return NULL;
+ }
return bh;
}
@@ -144,38 +156,40 @@ error_out:
* though), and then we'd have two inodes sharing the
* same inode number and space on the harddisk.
*/
-void ext4_free_inode (handle_t *handle, struct inode * inode)
+void ext4_free_inode(handle_t *handle, struct inode *inode)
{
- struct super_block * sb = inode->i_sb;
+ struct super_block *sb = inode->i_sb;
int is_directory;
unsigned long ino;
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *bh2;
ext4_group_t block_group;
unsigned long bit;
- struct ext4_group_desc * gdp;
- struct ext4_super_block * es;
+ struct ext4_group_desc *gdp;
+ struct ext4_super_block *es;
struct ext4_sb_info *sbi;
int fatal = 0, err;
+ ext4_group_t flex_group;
if (atomic_read(&inode->i_count) > 1) {
- printk ("ext4_free_inode: inode has count=%d\n",
- atomic_read(&inode->i_count));
+ printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
+ atomic_read(&inode->i_count));
return;
}
if (inode->i_nlink) {
- printk ("ext4_free_inode: inode has nlink=%d\n",
- inode->i_nlink);
+ printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
+ inode->i_nlink);
return;
}
if (!sb) {
- printk("ext4_free_inode: inode on nonexistent device\n");
+ printk(KERN_ERR "ext4_free_inode: inode on "
+ "nonexistent device\n");
return;
}
sbi = EXT4_SB(sb);
ino = inode->i_ino;
- ext4_debug ("freeing inode %lu\n", ino);
+ ext4_debug("freeing inode %lu\n", ino);
/*
* Note: we must free any quota before locking the superblock,
@@ -189,17 +203,17 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
is_directory = S_ISDIR(inode->i_mode);
/* Do this BEFORE marking the inode not in use or returning an error */
- clear_inode (inode);
+ clear_inode(inode);
es = EXT4_SB(sb)->s_es;
if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
- ext4_error (sb, "ext4_free_inode",
- "reserved or nonexistent inode %lu", ino);
+ ext4_error(sb, "ext4_free_inode",
+ "reserved or nonexistent inode %lu", ino);
goto error_return;
}
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
- bitmap_bh = read_inode_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
@@ -211,10 +225,10 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
/* Ok, now we can actually update the inode bitmaps.. */
if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
bit, bitmap_bh->b_data))
- ext4_error (sb, "ext4_free_inode",
- "bit already cleared for inode %lu", ino);
+ ext4_error(sb, "ext4_free_inode",
+ "bit already cleared for inode %lu", ino);
else {
- gdp = ext4_get_group_desc (sb, block_group, &bh2);
+ gdp = ext4_get_group_desc(sb, block_group, &bh2);
BUFFER_TRACE(bh2, "get_write_access");
fatal = ext4_journal_get_write_access(handle, bh2);
@@ -232,6 +246,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
if (is_directory)
percpu_counter_dec(&sbi->s_dirs_counter);
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes++;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
}
BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
err = ext4_journal_dirty_metadata(handle, bh2);
@@ -270,7 +290,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
avefreei = freei / ngroups;
for (group = 0; group < ngroups; group++) {
- desc = ext4_get_group_desc (sb, group, NULL);
+ desc = ext4_get_group_desc(sb, group, NULL);
if (!desc || !desc->bg_free_inodes_count)
continue;
if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei)
@@ -286,6 +306,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
return ret;
}
+#define free_block_ratio 10
+
+static int find_group_flex(struct super_block *sb, struct inode *parent,
+ ext4_group_t *best_group)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh;
+ struct flex_groups *flex_group = sbi->s_flex_groups;
+ ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+ ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
+ ext4_group_t ngroups = sbi->s_groups_count;
+ int flex_size = ext4_flex_bg_size(sbi);
+ ext4_group_t best_flex = parent_fbg_group;
+ int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
+ int flexbg_free_blocks;
+ int flex_freeb_ratio;
+ ext4_group_t n_fbg_groups;
+ ext4_group_t i;
+
+ n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+ sbi->s_log_groups_per_flex;
+
+find_close_to_parent:
+ flexbg_free_blocks = flex_group[best_flex].free_blocks;
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+ if (flex_group[best_flex].free_inodes &&
+ flex_freeb_ratio > free_block_ratio)
+ goto found_flexbg;
+
+ if (best_flex && best_flex == parent_fbg_group) {
+ best_flex--;
+ goto find_close_to_parent;
+ }
+
+ for (i = 0; i < n_fbg_groups; i++) {
+ if (i == parent_fbg_group || i == parent_fbg_group - 1)
+ continue;
+
+ flexbg_free_blocks = flex_group[i].free_blocks;
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+
+ if (flex_freeb_ratio > free_block_ratio &&
+ flex_group[i].free_inodes) {
+ best_flex = i;
+ goto found_flexbg;
+ }
+
+ if (flex_group[best_flex].free_inodes == 0 ||
+ (flex_group[i].free_blocks >
+ flex_group[best_flex].free_blocks &&
+ flex_group[i].free_inodes))
+ best_flex = i;
+ }
+
+ if (!flex_group[best_flex].free_inodes ||
+ !flex_group[best_flex].free_blocks)
+ return -1;
+
+found_flexbg:
+ for (i = best_flex * flex_size; i < ngroups &&
+ i < (best_flex + 1) * flex_size; i++) {
+ desc = ext4_get_group_desc(sb, i, &bh);
+ if (le16_to_cpu(desc->bg_free_inodes_count)) {
+ *best_group = i;
+ goto out;
+ }
+ }
+
+ return -1;
+out:
+ return 0;
+}
+
/*
* Orlov's allocator for directories.
*
@@ -485,22 +579,23 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*/
-struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
+struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
{
struct super_block *sb;
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *bh2;
ext4_group_t group = 0;
unsigned long ino = 0;
- struct inode * inode;
- struct ext4_group_desc * gdp = NULL;
- struct ext4_super_block * es;
+ struct inode *inode;
+ struct ext4_group_desc *gdp = NULL;
+ struct ext4_super_block *es;
struct ext4_inode_info *ei;
struct ext4_sb_info *sbi;
int ret2, err = 0;
struct inode *ret;
ext4_group_t i;
int free = 0;
+ ext4_group_t flex_group;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -514,14 +609,21 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
sbi = EXT4_SB(sb);
es = sbi->s_es;
+
+ if (sbi->s_log_groups_per_flex) {
+ ret2 = find_group_flex(sb, dir, &group);
+ goto got_group;
+ }
+
if (S_ISDIR(mode)) {
- if (test_opt (sb, OLDALLOC))
+ if (test_opt(sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
else
ret2 = find_group_orlov(sb, dir, &group);
} else
ret2 = find_group_other(sb, dir, &group);
+got_group:
err = -ENOSPC;
if (ret2 == -1)
goto out;
@@ -534,7 +636,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
goto fail;
brelse(bitmap_bh);
- bitmap_bh = read_inode_bitmap(sb, group);
+ bitmap_bh = ext4_read_inode_bitmap(sb, group);
if (!bitmap_bh)
goto fail;
@@ -600,7 +702,7 @@ got:
/* We may have to initialize the block bitmap if it isn't already */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- struct buffer_head *block_bh = read_block_bitmap(sb, group);
+ struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
BUFFER_TRACE(block_bh, "get block bitmap access");
err = ext4_journal_get_write_access(handle, block_bh);
@@ -639,7 +741,7 @@ got:
/* When marking the block group with
* ~EXT4_BG_INODE_UNINIT we don't want to depend
- * on the value of bg_itable_unsed even though
+ * on the value of bg_itable_unused even though
* mke2fs could have initialized the same for us.
* Instead we calculated the value below
*/
@@ -676,8 +778,15 @@ got:
percpu_counter_inc(&sbi->s_dirs_counter);
sb->s_dirt = 1;
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes--;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
inode->i_uid = current->fsuid;
- if (test_opt (sb, GRPID))
+ if (test_opt(sb, GRPID))
inode->i_gid = dir->i_gid;
else if (dir->i_mode & S_ISGID) {
inode->i_gid = dir->i_gid;
@@ -710,7 +819,6 @@ got:
ei->i_flags &= ~EXT4_DIRSYNC_FL;
ei->i_file_acl = 0;
ei->i_dtime = 0;
- ei->i_block_alloc_info = NULL;
ei->i_block_group = group;
ext4_set_inode_flags(inode);
@@ -726,7 +834,7 @@ got:
ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
ret = inode;
- if(DQUOT_ALLOC_INODE(inode)) {
+ if (DQUOT_ALLOC_INODE(inode)) {
err = -EDQUOT;
goto fail_drop;
}
@@ -735,19 +843,15 @@ got:
if (err)
goto fail_free_drop;
- err = ext4_init_security(handle,inode, dir);
+ err = ext4_init_security(handle, inode, dir);
if (err)
goto fail_free_drop;
if (test_opt(sb, EXTENTS)) {
- /* set extent flag only for diretory, file and normal symlink*/
+ /* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
ext4_ext_tree_init(handle, inode);
- err = ext4_update_incompat_feature(handle, sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS);
- if (err)
- goto fail_free_drop;
}
}
@@ -799,7 +903,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
- bitmap_bh = read_inode_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
if (!bitmap_bh) {
ext4_warning(sb, __func__,
"inode bitmap error for orphan %lu", ino);
@@ -817,6 +921,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
if (IS_ERR(inode))
goto iget_failed;
+ /*
+ * If the orphans has i_nlinks > 0 then it should be able to be
+ * truncated, otherwise it won't be removed from the orphan list
+ * during processing and an infinite loop will result.
+ */
+ if (inode->i_nlink && !ext4_can_truncate(inode))
+ goto bad_orphan;
+
if (NEXT_ORPHAN(inode) > max_ino)
goto bad_orphan;
brelse(bitmap_bh);
@@ -838,6 +950,7 @@ bad_orphan:
printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+ printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
@@ -848,7 +961,7 @@ error:
return ERR_PTR(err);
}
-unsigned long ext4_count_free_inodes (struct super_block * sb)
+unsigned long ext4_count_free_inodes(struct super_block *sb)
{
unsigned long desc_count;
struct ext4_group_desc *gdp;
@@ -863,12 +976,12 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
bitmap_count = 0;
gdp = NULL;
for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
- gdp = ext4_get_group_desc (sb, i, NULL);
+ gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
brelse(bitmap_bh);
- bitmap_bh = read_inode_bitmap(sb, i);
+ bitmap_bh = ext4_read_inode_bitmap(sb, i);
if (!bitmap_bh)
continue;
@@ -878,13 +991,14 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
bitmap_count += x;
}
brelse(bitmap_bh);
- printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n",
- le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
+ printk(KERN_DEBUG "ext4_count_free_inodes: "
+ "stored = %u, computed = %lu, %lu\n",
+ le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count);
return desc_count;
#else
desc_count = 0;
for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
- gdp = ext4_get_group_desc (sb, i, NULL);
+ gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
@@ -895,13 +1009,13 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
}
/* Called at mount-time, super-block is locked */
-unsigned long ext4_count_dirs (struct super_block * sb)
+unsigned long ext4_count_dirs(struct super_block * sb)
{
unsigned long count = 0;
ext4_group_t i;
for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
- struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL);
+ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
count += le16_to_cpu(gdp->bg_used_dirs_count);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d970774641..9b4ec9decfd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,12 +32,25 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "ext4_extents.h"
+
+#define MPAGE_DA_EXTENT_TAIL 0x01
+
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+ loff_t new_size)
+{
+ return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
+ new_size);
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
/*
* Test whether an inode is a fast symlink.
@@ -177,17 +190,21 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
/*
* Called at the last iput() if i_nlink is zero.
*/
-void ext4_delete_inode (struct inode * inode)
+void ext4_delete_inode(struct inode *inode)
{
handle_t *handle;
+ int err;
+ if (ext4_should_order_data(inode))
+ ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
goto no_delete;
- handle = start_transaction(inode);
+ handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
if (IS_ERR(handle)) {
+ ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
* If we're going to skip the normal cleanup, we still need to
* make sure that the in-core orphan linked list is properly
@@ -200,8 +217,34 @@ void ext4_delete_inode (struct inode * inode)
if (IS_SYNC(inode))
handle->h_sync = 1;
inode->i_size = 0;
+ err = ext4_mark_inode_dirty(handle, inode);
+ if (err) {
+ ext4_warning(inode->i_sb, __func__,
+ "couldn't mark inode dirty (err %d)", err);
+ goto stop_handle;
+ }
if (inode->i_blocks)
ext4_truncate(inode);
+
+ /*
+ * ext4_ext_truncate() doesn't reserve any slop when it
+ * restarts journal transactions; therefore there may not be
+ * enough credits left in the handle to remove the inode from
+ * the orphan list and set the dtime field.
+ */
+ if (handle->h_buffer_credits < 3) {
+ err = ext4_journal_extend(handle, 3);
+ if (err > 0)
+ err = ext4_journal_restart(handle, 3);
+ if (err != 0) {
+ ext4_warning(inode->i_sb, __func__,
+ "couldn't extend journal (err %d)", err);
+ stop_handle:
+ ext4_journal_stop(handle);
+ goto no_delete;
+ }
+ }
+
/*
* Kill off the orphan record which ext4_truncate created.
* AKPM: I think this can be inside the above `if'.
@@ -287,11 +330,11 @@ static int ext4_block_to_path(struct inode *inode,
int final = 0;
if (i_block < 0) {
- ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0");
+ ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0");
} else if (i_block < direct_blocks) {
offsets[n++] = i_block;
final = direct_blocks;
- } else if ( (i_block -= direct_blocks) < indirect_blocks) {
+ } else if ((i_block -= direct_blocks) < indirect_blocks) {
offsets[n++] = EXT4_IND_BLOCK;
offsets[n++] = i_block;
final = ptrs;
@@ -357,14 +400,14 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
*err = 0;
/* i_data is not going away, no lock needed */
- add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets);
+ add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets);
if (!p->key)
goto no_block;
while (--depth) {
bh = sb_bread(sb, le32_to_cpu(p->key));
if (!bh)
goto failure;
- add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
+ add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
/* Reader: end */
if (!p->key)
goto no_block;
@@ -400,7 +443,7 @@ no_block:
static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
+ __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
__le32 *p;
ext4_fsblk_t bg_start;
ext4_fsblk_t last_block;
@@ -443,18 +486,9 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
Indirect *partial)
{
- struct ext4_block_alloc_info *block_i;
-
- block_i = EXT4_I(inode)->i_block_alloc_info;
-
/*
- * try the heuristic for sequential allocation,
- * failing that at least try to get decent locality.
+ * XXX need to get goal block from mballoc's data structures
*/
- if (block_i && (block == block_i->last_alloc_logical_block + 1)
- && (block_i->last_alloc_physical_block != 0)) {
- return block_i->last_alloc_physical_block + 1;
- }
return ext4_find_near(inode, partial);
}
@@ -508,11 +542,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
* direct blocks
*/
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int indirect_blks, int blks,
- ext4_fsblk_t new_blocks[4], int *err)
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ int indirect_blks, int blks,
+ ext4_fsblk_t new_blocks[4], int *err)
{
int target, i;
- unsigned long count = 0;
+ unsigned long count = 0, blk_allocated = 0;
int index = 0;
ext4_fsblk_t current_block = 0;
int ret = 0;
@@ -525,12 +560,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
* the first direct block of this branch. That's the
* minimum number of blocks need to allocate(required)
*/
- target = blks + indirect_blks;
-
- while (1) {
+ /* first we try to allocate the indirect blocks */
+ target = indirect_blks;
+ while (target > 0) {
count = target;
/* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_blocks(handle,inode,goal,&count,err);
+ current_block = ext4_new_meta_blocks(handle, inode,
+ goal, &count, err);
if (*err)
goto failed_out;
@@ -540,20 +576,52 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
new_blocks[index++] = current_block++;
count--;
}
-
- if (count > 0)
+ if (count > 0) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ printk(KERN_INFO "%s returned more blocks than "
+ "requested\n", __func__);
+ WARN_ON(1);
break;
+ }
}
- /* save the new block number for the first direct block */
- new_blocks[index] = current_block;
-
+ target = blks - count ;
+ blk_allocated = count;
+ if (!target)
+ goto allocated;
+ /* Now allocate data blocks */
+ count = target;
+ /* allocating blocks for data blocks */
+ current_block = ext4_new_blocks(handle, inode, iblock,
+ goal, &count, err);
+ if (*err && (target == blks)) {
+ /*
+ * if the allocation failed and we didn't allocate
+ * any blocks before
+ */
+ goto failed_out;
+ }
+ if (!*err) {
+ if (target == blks) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ }
+ blk_allocated += count;
+ }
+allocated:
/* total number of blocks allocated for direct blocks */
- ret = count;
+ ret = blk_allocated;
*err = 0;
return ret;
failed_out:
- for (i = 0; i <index; i++)
+ for (i = 0; i < index; i++)
ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
return ret;
}
@@ -584,8 +652,9 @@ failed_out:
* as described above and return 0.
*/
static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
- int indirect_blks, int *blks, ext4_fsblk_t goal,
- ext4_lblk_t *offsets, Indirect *branch)
+ ext4_lblk_t iblock, int indirect_blks,
+ int *blks, ext4_fsblk_t goal,
+ ext4_lblk_t *offsets, Indirect *branch)
{
int blocksize = inode->i_sb->s_blocksize;
int i, n = 0;
@@ -595,7 +664,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
ext4_fsblk_t new_blocks[4];
ext4_fsblk_t current_block;
- num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
+ num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
*blks, new_blocks, &err);
if (err)
return err;
@@ -625,7 +694,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
branch[n].p = (__le32 *) bh->b_data + offsets[n];
branch[n].key = cpu_to_le32(new_blocks[n]);
*branch[n].p = branch[n].key;
- if ( n == indirect_blks) {
+ if (n == indirect_blks) {
current_block = new_blocks[n];
/*
* End of chain, update the last new metablock of
@@ -652,7 +721,7 @@ failed:
BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget");
ext4_journal_forget(handle, branch[i].bh);
}
- for (i = 0; i <indirect_blks; i++)
+ for (i = 0; i < indirect_blks; i++)
ext4_free_blocks(handle, inode, new_blocks[i], 1, 0);
ext4_free_blocks(handle, inode, new_blocks[i], num, 0);
@@ -679,10 +748,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
{
int i;
int err = 0;
- struct ext4_block_alloc_info *block_i;
ext4_fsblk_t current_block;
- block_i = EXT4_I(inode)->i_block_alloc_info;
/*
* If we're splicing into a [td]indirect block (as opposed to the
* inode) then we need to get write access to the [td]indirect block
@@ -705,18 +772,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
if (num == 0 && blks > 1) {
current_block = le32_to_cpu(where->key) + 1;
for (i = 1; i < blks; i++)
- *(where->p + i ) = cpu_to_le32(current_block++);
- }
-
- /*
- * update the most recently allocated logical & physical block
- * in i_block_alloc_info, to assist find the proper goal block for next
- * allocation
- */
- if (block_i) {
- block_i->last_alloc_logical_block = block + blks - 1;
- block_i->last_alloc_physical_block =
- le32_to_cpu(where[num].key) + blks - 1;
+ *(where->p + i) = cpu_to_le32(current_block++);
}
/* We are done with atomic stuff, now do the rest of housekeeping */
@@ -799,6 +855,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
struct ext4_inode_info *ei = EXT4_I(inode);
int count = 0;
ext4_fsblk_t first_block = 0;
+ loff_t disksize;
J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -835,12 +892,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
goto cleanup;
/*
- * Okay, we need to do block allocation. Lazily initialize the block
- * allocation info here if necessary
+ * Okay, we need to do block allocation.
*/
- if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
- ext4_init_block_alloc_info(inode);
-
goal = ext4_find_goal(inode, iblock, partial);
/* the number of blocks need to allocate for [d,t]indirect blocks */
@@ -855,8 +908,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
/*
* Block out ext4_truncate while we alter the tree
*/
- err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
- offsets + (partial - chain), partial);
+ err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+ &count, goal,
+ offsets + (partial - chain), partial);
/*
* The ext4_splice_branch call will free and forget any buffers
@@ -873,8 +927,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
* protect it if you're about to implement concurrent
* ext4_get_block() -bzzz
*/
- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
- ei->i_disksize = inode->i_size;
+ if (!err && extend_disksize) {
+ disksize = ((loff_t) iblock + count) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > ei->i_disksize)
+ ei->i_disksize = disksize;
+ }
if (err)
goto cleanup;
@@ -897,23 +956,75 @@ out:
return err;
}
-/* Maximum number of blocks we map for direct IO at once. */
-#define DIO_MAX_BLOCKS 4096
/*
- * Number of credits we need for writing DIO_MAX_BLOCKS:
- * We need sb + group descriptor + bitmap + inode -> 4
- * For B blocks with A block pointers per block we need:
- * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
- * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate @blocks for non extent file based file
*/
-#define DIO_CREDITS 25
+static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int ind_blks, dind_blks, tind_blks;
+
+ /* number of new indirect blocks needed */
+ ind_blks = (blocks + icap - 1) / icap;
+
+ dind_blks = (ind_blks + icap - 1) / icap;
+ tind_blks = 1;
+
+ return ind_blks + dind_blks + tind_blks;
+}
/*
+ * Calculate the number of metadata blocks need to reserve
+ * to allocate given number of blocks
+ */
+static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ if (!blocks)
+ return 0;
+
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ return ext4_ext_calc_metadata_amount(inode, blocks);
+
+ return ext4_indirect_calc_metadata_amount(inode, blocks);
+}
+
+static void ext4_da_update_reserve_space(struct inode *inode, int used)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int total, mdb, mdb_free;
+
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ /* recalculate the number of metablocks still need to be reserved */
+ total = EXT4_I(inode)->i_reserved_data_blocks - used;
+ mdb = ext4_calc_metadata_amount(inode, total);
+
+ /* figure out how many metablocks to release */
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+ mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+
+ if (mdb_free) {
+ /* Account for allocated meta_blocks */
+ mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+
+ /* update fs dirty blocks counter */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+ EXT4_I(inode)->i_allocated_meta_blocks = 0;
+ EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+ }
+
+ /* update per-inode reservations */
+ BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
+ EXT4_I(inode)->i_reserved_data_blocks -= used;
+
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+}
+
+/*
+ * The ext4_get_blocks_wrap() function try to look up the requested blocks,
+ * and returns if the blocks are already mapped.
*
- *
- * ext4_ext4 get_block() wrapper function
- * It will do a look up first, and returns if the blocks already mapped.
* Otherwise it takes the write lock of the i_data_sem and allocate blocks
* and store the allocated blocks in the result buffer head and mark it
* mapped.
@@ -934,7 +1045,7 @@ out:
*/
int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
unsigned long max_blocks, struct buffer_head *bh,
- int create, int extend_disksize)
+ int create, int extend_disksize, int flag)
{
int retval;
@@ -975,6 +1086,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
* with create == 1 flag.
*/
down_write((&EXT4_I(inode)->i_data_sem));
+
+ /*
+ * if the caller is from delayed allocation writeout path
+ * we have already reserved fs blocks for allocation
+ * let the underlying get_block() function know to
+ * avoid double accounting
+ */
+ if (flag)
+ EXT4_I(inode)->i_delalloc_reserved_flag = 1;
/*
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
@@ -996,23 +1116,39 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
~EXT4_EXT_MIGRATE;
}
}
+
+ if (flag) {
+ EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+ /*
+ * Update reserved blocks/metadata blocks
+ * after successful block allocation
+ * which were deferred till now
+ */
+ if ((retval > 0) && buffer_delay(bh))
+ ext4_da_update_reserve_space(inode, retval);
+ }
+
up_write((&EXT4_I(inode)->i_data_sem));
return retval;
}
-static int ext4_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+/* Maximum number of blocks we map for direct IO at once. */
+#define DIO_MAX_BLOCKS 4096
+
+int ext4_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
{
handle_t *handle = ext4_journal_current_handle();
int ret = 0, started = 0;
unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int dio_credits;
if (create && !handle) {
/* Direct IO write... */
if (max_blocks > DIO_MAX_BLOCKS)
max_blocks = DIO_MAX_BLOCKS;
- handle = ext4_journal_start(inode, DIO_CREDITS +
- 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
+ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+ handle = ext4_journal_start(inode, dio_credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out;
@@ -1021,7 +1157,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
}
ret = ext4_get_blocks_wrap(handle, inode, iblock,
- max_blocks, bh_result, create, 0);
+ max_blocks, bh_result, create, 0, 0);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
ret = 0;
@@ -1047,7 +1183,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
dummy.b_blocknr = -1000;
buffer_trace_init(&dummy.b_history);
err = ext4_get_blocks_wrap(handle, inode, block, 1,
- &dummy, create, 1);
+ &dummy, create, 1, 0);
/*
* ext4_get_blocks_handle() returns number of blocks
* mapped. 0 in case of a HOLE.
@@ -1080,7 +1216,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
BUFFER_TRACE(bh, "call get_create_access");
fatal = ext4_journal_get_create_access(handle, bh);
if (!fatal && !buffer_uptodate(bh)) {
- memset(bh->b_data,0,inode->i_sb->s_blocksize);
+ memset(bh->b_data, 0, inode->i_sb->s_blocksize);
set_buffer_uptodate(bh);
}
unlock_buffer(bh);
@@ -1105,7 +1241,7 @@ err:
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
ext4_lblk_t block, int create, int *err)
{
- struct buffer_head * bh;
+ struct buffer_head *bh;
bh = ext4_getblk(handle, inode, block, create, err);
if (!bh)
@@ -1121,13 +1257,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return NULL;
}
-static int walk_page_buffers( handle_t *handle,
- struct buffer_head *head,
- unsigned from,
- unsigned to,
- int *partial,
- int (*fn)( handle_t *handle,
- struct buffer_head *bh))
+static int walk_page_buffers(handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)(handle_t *handle,
+ struct buffer_head *bh))
{
struct buffer_head *bh;
unsigned block_start, block_end;
@@ -1135,9 +1271,9 @@ static int walk_page_buffers( handle_t *handle,
int err, ret = 0;
struct buffer_head *next;
- for ( bh = head, block_start = 0;
- ret == 0 && (bh != head || !block_start);
- block_start = block_end, bh = next)
+ for (bh = head, block_start = 0;
+ ret == 0 && (bh != head || !block_start);
+ block_start = block_end, bh = next)
{
next = bh->b_this_page;
block_end = block_start + blocksize;
@@ -1190,31 +1326,32 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = mapping->host;
int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
handle_t *handle;
int retries = 0;
- struct page *page;
+ struct page *page;
pgoff_t index;
- unsigned from, to;
+ unsigned from, to;
index = pos >> PAGE_CACHE_SHIFT;
- from = pos & (PAGE_CACHE_SIZE - 1);
- to = from + len;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + len;
retry:
- page = __grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
- handle = ext4_journal_start(inode, needed_blocks);
- if (IS_ERR(handle)) {
- unlock_page(page);
- page_cache_release(page);
- ret = PTR_ERR(handle);
- goto out;
+ page = __grab_cache_page(mapping, index);
+ if (!page) {
+ ext4_journal_stop(handle);
+ ret = -ENOMEM;
+ goto out;
}
+ *pagep = page;
ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
ext4_get_block);
@@ -1225,9 +1362,16 @@ retry:
}
if (ret) {
+ unlock_page(page);
ext4_journal_stop(handle);
- unlock_page(page);
- page_cache_release(page);
+ page_cache_release(page);
+ /*
+ * block_write_begin may have instantiated a few blocks
+ * outside i_size. Trim these off again. Don't need
+ * i_size_read because we hold i_mutex.
+ */
+ if (pos + len > inode->i_size)
+ vmtruncate(inode, inode->i_size);
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -1236,15 +1380,6 @@ out:
return ret;
}
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- int err = jbd2_journal_dirty_data(handle, bh);
- if (err)
- ext4_journal_abort_handle(__func__, __func__,
- bh, handle, err);
- return err;
-}
-
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
@@ -1255,29 +1390,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
}
/*
- * Generic write_end handler for ordered and writeback ext4 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
- * after block_write_end.
- */
-static int ext4_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = file->f_mapping->host;
-
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
- }
-
- return copied;
-}
-
-/*
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
*
@@ -1290,28 +1402,25 @@ static int ext4_ordered_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
- unsigned from, to;
+ struct inode *inode = mapping->host;
int ret = 0, ret2;
- from = pos & (PAGE_CACHE_SIZE - 1);
- to = from + len;
-
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, ext4_journal_dirty_data);
+ ret = ext4_jbd2_file_inode(handle, inode);
if (ret == 0) {
- /*
- * generic_write_end() will run mark_inode_dirty() if i_size
- * changes. So let's piggyback the i_disksize mark_inode_dirty
- * into that.
- */
loff_t new_i_size;
new_i_size = pos + copied;
- if (new_i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = new_i_size;
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ ext4_update_i_disksize(inode, new_i_size);
+ /* We need to mark inode dirty even if
+ * new_i_size is less that inode->i_size
+ * bu greater than i_disksize.(hint delalloc)
+ */
+ ext4_mark_inode_dirty(handle, inode);
+ }
+
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1320,8 +1429,6 @@ static int ext4_ordered_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
return ret ? ret : copied;
}
@@ -1332,15 +1439,21 @@ static int ext4_writeback_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
int ret = 0, ret2;
loff_t new_i_size;
new_i_size = pos + copied;
- if (new_i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = new_i_size;
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ ext4_update_i_disksize(inode, new_i_size);
+ /* We need to mark inode dirty even if
+ * new_i_size is less that inode->i_size
+ * bu greater than i_disksize.(hint delalloc)
+ */
+ ext4_mark_inode_dirty(handle, inode);
+ }
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1349,8 +1462,6 @@ static int ext4_writeback_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
return ret ? ret : copied;
}
@@ -1365,6 +1476,7 @@ static int ext4_journalled_write_end(struct file *file,
int ret = 0, ret2;
int partial = 0;
unsigned from, to;
+ loff_t new_i_size;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
@@ -1379,25 +1491,1181 @@ static int ext4_journalled_write_end(struct file *file,
to, &partial, write_end_fn);
if (!partial)
SetPageUptodate(page);
- if (pos+copied > inode->i_size)
+ new_i_size = pos + copied;
+ if (new_i_size > inode->i_size)
i_size_write(inode, pos+copied);
EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
- if (inode->i_size > EXT4_I(inode)->i_disksize) {
- EXT4_I(inode)->i_disksize = inode->i_size;
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ ext4_update_i_disksize(inode, new_i_size);
ret2 = ext4_mark_inode_dirty(handle, inode);
if (!ret)
ret = ret2;
}
+ unlock_page(page);
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
page_cache_release(page);
return ret ? ret : copied;
}
+static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+{
+ int retries = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned long md_needed, mdblocks, total = 0;
+
+ /*
+ * recalculate the amount of metadata blocks to reserve
+ * in order to allocate nrblocks
+ * worse case is one extent per block
+ */
+repeat:
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+ mdblocks = ext4_calc_metadata_amount(inode, total);
+ BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+
+ md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
+ total = md_needed + nrblocks;
+
+ if (ext4_claim_free_blocks(sbi, total)) {
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+ yield();
+ goto repeat;
+ }
+ return -ENOSPC;
+ }
+ EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+ EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ return 0; /* success */
+}
+
+static void ext4_da_release_space(struct inode *inode, int to_free)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int total, mdb, mdb_free, release;
+
+ if (!to_free)
+ return; /* Nothing to release, exit */
+
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ if (!EXT4_I(inode)->i_reserved_data_blocks) {
+ /*
+ * if there is no reserved blocks, but we try to free some
+ * then the counter is messed up somewhere.
+ * but since this function is called from invalidate
+ * page, it's harmless to return without any action
+ */
+ printk(KERN_INFO "ext4 delalloc try to release %d reserved "
+ "blocks for inode %lu, but there is no reserved "
+ "data blocks\n", to_free, inode->i_ino);
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ return;
+ }
+
+ /* recalculate the number of metablocks still need to be reserved */
+ total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
+ mdb = ext4_calc_metadata_amount(inode, total);
+
+ /* figure out how many metablocks to release */
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+ mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+
+ release = to_free + mdb_free;
+
+ /* update fs dirty blocks counter for truncate case */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
+
+ /* update per-inode reservations */
+ BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
+ EXT4_I(inode)->i_reserved_data_blocks -= to_free;
+
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+ EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+}
+
+static void ext4_da_page_release_reservation(struct page *page,
+ unsigned long offset)
+{
+ int to_release = 0;
+ struct buffer_head *head, *bh;
+ unsigned int curr_off = 0;
+
+ head = page_buffers(page);
+ bh = head;
+ do {
+ unsigned int next_off = curr_off + bh->b_size;
+
+ if ((offset <= curr_off) && (buffer_delay(bh))) {
+ to_release++;
+ clear_buffer_delay(bh);
+ }
+ curr_off = next_off;
+ } while ((bh = bh->b_this_page) != head);
+ ext4_da_release_space(page->mapping->host, to_release);
+}
+
+/*
+ * Delayed allocation stuff
+ */
+
+struct mpage_da_data {
+ struct inode *inode;
+ struct buffer_head lbh; /* extent of blocks */
+ unsigned long first_page, next_page; /* extent of pages */
+ get_block_t *get_block;
+ struct writeback_control *wbc;
+ int io_done;
+ long pages_written;
+ int retval;
+};
+
+/*
+ * mpage_da_submit_io - walks through extent of pages and try to write
+ * them with writepage() call back
+ *
+ * @mpd->inode: inode
+ * @mpd->first_page: first page of the extent
+ * @mpd->next_page: page after the last page of the extent
+ * @mpd->get_block: the filesystem's block mapper function
+ *
+ * By the time mpage_da_submit_io() is called we expect all blocks
+ * to be allocated. this may be wrong if allocation failed.
+ *
+ * As pages are already locked by write_cache_pages(), we can't use it
+ */
+static int mpage_da_submit_io(struct mpage_da_data *mpd)
+{
+ struct address_space *mapping = mpd->inode->i_mapping;
+ int ret = 0, err, nr_pages, i;
+ unsigned long index, end;
+ struct pagevec pvec;
+
+ BUG_ON(mpd->next_page <= mpd->first_page);
+ pagevec_init(&pvec, 0);
+ index = mpd->first_page;
+ end = mpd->next_page - 1;
+
+ while (index <= end) {
+ /* XXX: optimize tail */
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ err = mapping->a_ops->writepage(page, mpd->wbc);
+ if (!err)
+ mpd->pages_written++;
+ /*
+ * In error case, we have to continue because
+ * remaining pages are still locked
+ * XXX: unlock and re-dirty them?
+ */
+ if (ret == 0)
+ ret = err;
+ }
+ pagevec_release(&pvec);
+ }
+ return ret;
+}
+
+/*
+ * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
+ *
+ * @mpd->inode - inode to walk through
+ * @exbh->b_blocknr - first block on a disk
+ * @exbh->b_size - amount of space in bytes
+ * @logical - first logical block to start assignment with
+ *
+ * the function goes through all passed space and put actual disk
+ * block numbers into buffer heads, dropping BH_Delay
+ */
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
+ struct buffer_head *exbh)
+{
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
+ int blocks = exbh->b_size >> inode->i_blkbits;
+ sector_t pblock = exbh->b_blocknr, cur_logical;
+ struct buffer_head *head, *bh;
+ pgoff_t index, end;
+ struct pagevec pvec;
+ int nr_pages, i;
+
+ index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ pagevec_init(&pvec, 0);
+
+ while (index <= end) {
+ /* XXX: optimize tail */
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+ BUG_ON(!page_has_buffers(page));
+
+ bh = page_buffers(page);
+ head = bh;
+
+ /* skip blocks out of the range */
+ do {
+ if (cur_logical >= logical)
+ break;
+ cur_logical++;
+ } while ((bh = bh->b_this_page) != head);
+
+ do {
+ if (cur_logical >= logical + blocks)
+ break;
+ if (buffer_delay(bh)) {
+ bh->b_blocknr = pblock;
+ clear_buffer_delay(bh);
+ bh->b_bdev = inode->i_sb->s_bdev;
+ } else if (buffer_unwritten(bh)) {
+ bh->b_blocknr = pblock;
+ clear_buffer_unwritten(bh);
+ set_buffer_mapped(bh);
+ set_buffer_new(bh);
+ bh->b_bdev = inode->i_sb->s_bdev;
+ } else if (buffer_mapped(bh))
+ BUG_ON(bh->b_blocknr != pblock);
+
+ cur_logical++;
+ pblock++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+ pagevec_release(&pvec);
+ }
+}
+
+
+/*
+ * __unmap_underlying_blocks - just a helper function to unmap
+ * set of blocks described by @bh
+ */
+static inline void __unmap_underlying_blocks(struct inode *inode,
+ struct buffer_head *bh)
+{
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int blocks, i;
+
+ blocks = bh->b_size >> inode->i_blkbits;
+ for (i = 0; i < blocks; i++)
+ unmap_underlying_metadata(bdev, bh->b_blocknr + i);
+}
+
+static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
+ sector_t logical, long blk_cnt)
+{
+ int nr_pages, i;
+ pgoff_t index, end;
+ struct pagevec pvec;
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
+
+ index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ end = (logical + blk_cnt - 1) >>
+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ while (index <= end) {
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+ block_invalidatepage(page, 0);
+ ClearPageUptodate(page);
+ unlock_page(page);
+ }
+ }
+ return;
+}
+
+static void ext4_print_free_blocks(struct inode *inode)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ printk(KERN_EMERG "Total free blocks count %lld\n",
+ ext4_count_free_blocks(inode->i_sb));
+ printk(KERN_EMERG "Free/Dirty block details\n");
+ printk(KERN_EMERG "free_blocks=%lld\n",
+ percpu_counter_sum(&sbi->s_freeblocks_counter));
+ printk(KERN_EMERG "dirty_blocks=%lld\n",
+ percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+ printk(KERN_EMERG "Block reservation details\n");
+ printk(KERN_EMERG "i_reserved_data_blocks=%lu\n",
+ EXT4_I(inode)->i_reserved_data_blocks);
+ printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n",
+ EXT4_I(inode)->i_reserved_meta_blocks);
+ return;
+}
+
+/*
+ * mpage_da_map_blocks - go through given space
+ *
+ * @mpd->lbh - bh describing space
+ * @mpd->get_block - the filesystem's block mapper function
+ *
+ * The function skips space we know is already mapped to disk blocks.
+ *
+ */
+static int mpage_da_map_blocks(struct mpage_da_data *mpd)
+{
+ int err = 0;
+ struct buffer_head new;
+ struct buffer_head *lbh = &mpd->lbh;
+ sector_t next;
+
+ /*
+ * We consider only non-mapped and non-allocated blocks
+ */
+ if (buffer_mapped(lbh) && !buffer_delay(lbh))
+ return 0;
+ new.b_state = lbh->b_state;
+ new.b_blocknr = 0;
+ new.b_size = lbh->b_size;
+ next = lbh->b_blocknr;
+ /*
+ * If we didn't accumulate anything
+ * to write simply return
+ */
+ if (!new.b_size)
+ return 0;
+ err = mpd->get_block(mpd->inode, next, &new, 1);
+ if (err) {
+
+ /* If get block returns with error
+ * we simply return. Later writepage
+ * will redirty the page and writepages
+ * will find the dirty page again
+ */
+ if (err == -EAGAIN)
+ return 0;
+
+ if (err == -ENOSPC &&
+ ext4_count_free_blocks(mpd->inode->i_sb)) {
+ mpd->retval = err;
+ return 0;
+ }
+
+ /*
+ * get block failure will cause us
+ * to loop in writepages. Because
+ * a_ops->writepage won't be able to
+ * make progress. The page will be redirtied
+ * by writepage and writepages will again
+ * try to write the same.
+ */
+ printk(KERN_EMERG "%s block allocation failed for inode %lu "
+ "at logical offset %llu with max blocks "
+ "%zd with error %d\n",
+ __func__, mpd->inode->i_ino,
+ (unsigned long long)next,
+ lbh->b_size >> mpd->inode->i_blkbits, err);
+ printk(KERN_EMERG "This should not happen.!! "
+ "Data will be lost\n");
+ if (err == -ENOSPC) {
+ ext4_print_free_blocks(mpd->inode);
+ }
+ /* invlaidate all the pages */
+ ext4_da_block_invalidatepages(mpd, next,
+ lbh->b_size >> mpd->inode->i_blkbits);
+ return err;
+ }
+ BUG_ON(new.b_size == 0);
+
+ if (buffer_new(&new))
+ __unmap_underlying_blocks(mpd->inode, &new);
+
+ /*
+ * If blocks are delayed marked, we need to
+ * put actual blocknr and drop delayed bit
+ */
+ if (buffer_delay(lbh) || buffer_unwritten(lbh))
+ mpage_put_bnr_to_bhs(mpd, next, &new);
+
+ return 0;
+}
+
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
+ (1 << BH_Delay) | (1 << BH_Unwritten))
+
+/*
+ * mpage_add_bh_to_extent - try to add one more block to extent of blocks
+ *
+ * @mpd->lbh - extent of blocks
+ * @logical - logical number of the block in the file
+ * @bh - bh of the block (used to access block's state)
+ *
+ * the function is used to collect contig. blocks in same state
+ */
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+ sector_t logical, struct buffer_head *bh)
+{
+ sector_t next;
+ size_t b_size = bh->b_size;
+ struct buffer_head *lbh = &mpd->lbh;
+ int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
+
+ /* check if thereserved journal credits might overflow */
+ if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
+ if (nrblocks >= EXT4_MAX_TRANS_DATA) {
+ /*
+ * With non-extent format we are limited by the journal
+ * credit available. Total credit needed to insert
+ * nrblocks contiguous blocks is dependent on the
+ * nrblocks. So limit nrblocks.
+ */
+ goto flush_it;
+ } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
+ EXT4_MAX_TRANS_DATA) {
+ /*
+ * Adding the new buffer_head would make it cross the
+ * allowed limit for which we have journal credit
+ * reserved. So limit the new bh->b_size
+ */
+ b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
+ mpd->inode->i_blkbits;
+ /* we will do mpage_da_submit_io in the next loop */
+ }
+ }
+ /*
+ * First block in the extent
+ */
+ if (lbh->b_size == 0) {
+ lbh->b_blocknr = logical;
+ lbh->b_size = b_size;
+ lbh->b_state = bh->b_state & BH_FLAGS;
+ return;
+ }
+
+ next = lbh->b_blocknr + nrblocks;
+ /*
+ * Can we merge the block to our big extent?
+ */
+ if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
+ lbh->b_size += b_size;
+ return;
+ }
+
+flush_it:
+ /*
+ * We couldn't merge the block to our extent, so we
+ * need to flush current extent and start new one
+ */
+ if (mpage_da_map_blocks(mpd) == 0)
+ mpage_da_submit_io(mpd);
+ mpd->io_done = 1;
+ return;
+}
+
+/*
+ * __mpage_da_writepage - finds extent of pages and blocks
+ *
+ * @page: page to consider
+ * @wbc: not used, we just follow rules
+ * @data: context
+ *
+ * The function finds extents of pages and scan them for all blocks.
+ */
+static int __mpage_da_writepage(struct page *page,
+ struct writeback_control *wbc, void *data)
+{
+ struct mpage_da_data *mpd = data;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *bh, *head, fake;
+ sector_t logical;
+
+ if (mpd->io_done) {
+ /*
+ * Rest of the page in the page_vec
+ * redirty then and skip then. We will
+ * try to to write them again after
+ * starting a new transaction
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return MPAGE_DA_EXTENT_TAIL;
+ }
+ /*
+ * Can we merge this page to current extent?
+ */
+ if (mpd->next_page != page->index) {
+ /*
+ * Nope, we can't. So, we map non-allocated blocks
+ * and start IO on them using writepage()
+ */
+ if (mpd->next_page != mpd->first_page) {
+ if (mpage_da_map_blocks(mpd) == 0)
+ mpage_da_submit_io(mpd);
+ /*
+ * skip rest of the page in the page_vec
+ */
+ mpd->io_done = 1;
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return MPAGE_DA_EXTENT_TAIL;
+ }
+
+ /*
+ * Start next extent of pages ...
+ */
+ mpd->first_page = page->index;
+
+ /*
+ * ... and blocks
+ */
+ mpd->lbh.b_size = 0;
+ mpd->lbh.b_state = 0;
+ mpd->lbh.b_blocknr = 0;
+ }
+
+ mpd->next_page = page->index + 1;
+ logical = (sector_t) page->index <<
+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ if (!page_has_buffers(page)) {
+ /*
+ * There is no attached buffer heads yet (mmap?)
+ * we treat the page asfull of dirty blocks
+ */
+ bh = &fake;
+ bh->b_size = PAGE_CACHE_SIZE;
+ bh->b_state = 0;
+ set_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ mpage_add_bh_to_extent(mpd, logical, bh);
+ if (mpd->io_done)
+ return MPAGE_DA_EXTENT_TAIL;
+ } else {
+ /*
+ * Page with regular buffer heads, just add all dirty ones
+ */
+ head = page_buffers(page);
+ bh = head;
+ do {
+ BUG_ON(buffer_locked(bh));
+ if (buffer_dirty(bh) &&
+ (!buffer_mapped(bh) || buffer_delay(bh))) {
+ mpage_add_bh_to_extent(mpd, logical, bh);
+ if (mpd->io_done)
+ return MPAGE_DA_EXTENT_TAIL;
+ }
+ logical++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+
+ return 0;
+}
+
+/*
+ * mpage_da_writepages - walk the list of dirty pages of the given
+ * address space, allocates non-allocated blocks, maps newly-allocated
+ * blocks to existing bhs and issue IO them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ */
+static int mpage_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct mpage_da_data *mpd)
+{
+ long to_write;
+ int ret;
+
+ if (!mpd->get_block)
+ return generic_writepages(mapping, wbc);
+
+ mpd->lbh.b_size = 0;
+ mpd->lbh.b_state = 0;
+ mpd->lbh.b_blocknr = 0;
+ mpd->first_page = 0;
+ mpd->next_page = 0;
+ mpd->io_done = 0;
+ mpd->pages_written = 0;
+ mpd->retval = 0;
+
+ to_write = wbc->nr_to_write;
+
+ ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
+
+ /*
+ * Handle last extent of pages
+ */
+ if (!mpd->io_done && mpd->next_page != mpd->first_page) {
+ if (mpage_da_map_blocks(mpd) == 0)
+ mpage_da_submit_io(mpd);
+ }
+
+ wbc->nr_to_write = to_write - mpd->pages_written;
+ return ret;
+}
+
+/*
+ * this is a special callback for ->write_begin() only
+ * it's intention is to return mapped block or reserve space
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+
+ BUG_ON(create == 0);
+ BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+ /*
+ * first, we need to know whether the block is allocated already
+ * preallocated blocks are unmapped but should treated
+ * the same as allocated blocks.
+ */
+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
+ if ((ret == 0) && !buffer_delay(bh_result)) {
+ /* the block isn't (pre)allocated yet, let's reserve space */
+ /*
+ * XXX: __block_prepare_write() unmaps passed block,
+ * is it OK?
+ */
+ ret = ext4_da_reserve_space(inode, 1);
+ if (ret)
+ /* not enough space to reserve */
+ return ret;
+
+ map_bh(bh_result, inode->i_sb, 0);
+ set_buffer_new(bh_result);
+ set_buffer_delay(bh_result);
+ } else if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+
+ return ret;
+}
+#define EXT4_DELALLOC_RSVED 1
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ loff_t disksize = EXT4_I(inode)->i_disksize;
+ handle_t *handle = NULL;
+
+ handle = ext4_journal_current_handle();
+ BUG_ON(!handle);
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, create, 0, EXT4_DELALLOC_RSVED);
+ if (ret > 0) {
+
+ bh_result->b_size = (ret << inode->i_blkbits);
+
+ if (ext4_should_order_data(inode)) {
+ int retval;
+ retval = ext4_jbd2_file_inode(handle, inode);
+ if (retval)
+ /*
+ * Failed to add inode for ordered
+ * mode. Don't update file size
+ */
+ return retval;
+ }
+
+ /*
+ * Update on-disk size along with block allocation
+ * we don't use 'extend_disksize' as size may change
+ * within already allocated block -bzzz
+ */
+ disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ ext4_update_i_disksize(inode, disksize);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ return ret;
+ }
+ ret = 0;
+ }
+ return ret;
+}
+
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+ /*
+ * unmapped buffer is possible for holes.
+ * delay buffer is possible with delayed allocation
+ */
+ return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
+}
+
+static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+
+ /*
+ * we don't want to do block allocation in writepage
+ * so call get_block_wrap with create = 0
+ */
+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
+ bh_result, 0, 0, 0);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ return ret;
+}
+
+/*
+ * get called vi ext4_da_writepages after taking page lock (have journal handle)
+ * get called via journal_submit_inode_data_buffers (no journal handle)
+ * get called via shrink_page_list via pdflush (no journal handle)
+ * or grab_page_cache when doing write_begin (have journal handle)
+ */
+static int ext4_da_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ loff_t size;
+ unsigned long len;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;
+
+ size = i_size_read(inode);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ page_bufs = page_buffers(page);
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_unmapped_or_delay)) {
+ /*
+ * We don't want to do block allocation
+ * So redirty the page and return
+ * We may reach here when we do a journal commit
+ * via journal_submit_inode_data_buffers.
+ * If we don't have mapping block we just ignore
+ * them. We can also reach here via shrink_page_list
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
+ /*
+ * The test for page_has_buffers() is subtle:
+ * We know the page is dirty but it lost buffers. That means
+ * that at some moment in time after write_begin()/write_end()
+ * has been called all buffers have been clean and thus they
+ * must have been written at least once. So they are all
+ * mapped and we can happily proceed with mapping them
+ * and writing the page.
+ *
+ * Try to initialize the buffer_heads and check whether
+ * all are mapped and non delay. We don't want to
+ * do block allocation here.
+ */
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_normal_get_block_write);
+ if (!ret) {
+ page_bufs = page_buffers(page);
+ /* check whether all are mapped and non delay */
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_unmapped_or_delay)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
+ /*
+ * We can't do block allocation here
+ * so just redity the page and unlock
+ * and return
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ }
+
+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+ else
+ ret = block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
+
+ return ret;
+}
+
+/*
+ * This is called via ext4_da_writepages() to
+ * calulate the total number of credits to reserve to fit
+ * a single extent allocation into a single transaction,
+ * ext4_da_writpeages() will loop calling this before
+ * the block allocation.
+ */
+
+static int ext4_da_writepages_trans_blocks(struct inode *inode)
+{
+ int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+
+ /*
+ * With non-extent format the journal credit needed to
+ * insert nrblocks contiguous block is dependent on
+ * number of contiguous block. So we will limit
+ * number of contiguous block to a sane value
+ */
+ if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
+ (max_blocks > EXT4_MAX_TRANS_DATA))
+ max_blocks = EXT4_MAX_TRANS_DATA;
+
+ return ext4_chunk_trans_blocks(inode, max_blocks);
+}
+
+static int ext4_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ handle_t *handle = NULL;
+ loff_t range_start = 0;
+ struct mpage_da_data mpd;
+ struct inode *inode = mapping->host;
+ int needed_blocks, ret = 0, nr_to_writebump = 0;
+ long to_write, pages_skipped = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+
+ /*
+ * No pages to write? This is mainly a kludge to avoid starting
+ * a transaction for special inodes like journal inode on last iput()
+ * because that could violate lock ordering on umount
+ */
+ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ return 0;
+ /*
+ * Make sure nr_to_write is >= sbi->s_mb_stream_request
+ * This make sure small files blocks are allocated in
+ * single attempt. This ensure that small files
+ * get less fragmented.
+ */
+ if (wbc->nr_to_write < sbi->s_mb_stream_request) {
+ nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
+ wbc->nr_to_write = sbi->s_mb_stream_request;
+ }
+
+ if (!wbc->range_cyclic)
+ /*
+ * If range_cyclic is not set force range_cont
+ * and save the old writeback_index
+ */
+ wbc->range_cont = 1;
+
+ range_start = wbc->range_start;
+ pages_skipped = wbc->pages_skipped;
+
+ mpd.wbc = wbc;
+ mpd.inode = mapping->host;
+
+restart_loop:
+ to_write = wbc->nr_to_write;
+ while (!ret && to_write > 0) {
+
+ /*
+ * we insert one extent at a time. So we need
+ * credit needed for single extent allocation.
+ * journalled mode is currently not supported
+ * by delalloc
+ */
+ BUG_ON(ext4_should_journal_data(inode));
+ needed_blocks = ext4_da_writepages_trans_blocks(inode);
+
+ /* start a new transaction*/
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ printk(KERN_EMERG "%s: jbd2_start: "
+ "%ld pages, ino %lu; err %d\n", __func__,
+ wbc->nr_to_write, inode->i_ino, ret);
+ dump_stack();
+ goto out_writepages;
+ }
+ to_write -= wbc->nr_to_write;
+
+ mpd.get_block = ext4_da_get_block_write;
+ ret = mpage_da_writepages(mapping, wbc, &mpd);
+
+ ext4_journal_stop(handle);
+
+ if (mpd.retval == -ENOSPC)
+ jbd2_journal_force_commit_nested(sbi->s_journal);
+
+ /* reset the retry count */
+ if (ret == MPAGE_DA_EXTENT_TAIL) {
+ /*
+ * got one extent now try with
+ * rest of the pages
+ */
+ to_write += wbc->nr_to_write;
+ ret = 0;
+ } else if (wbc->nr_to_write) {
+ /*
+ * There is no more writeout needed
+ * or we requested for a noblocking writeout
+ * and we found the device congested
+ */
+ to_write += wbc->nr_to_write;
+ break;
+ }
+ wbc->nr_to_write = to_write;
+ }
+
+ if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
+ /* We skipped pages in this loop */
+ wbc->range_start = range_start;
+ wbc->nr_to_write = to_write +
+ wbc->pages_skipped - pages_skipped;
+ wbc->pages_skipped = pages_skipped;
+ goto restart_loop;
+ }
+
+out_writepages:
+ wbc->nr_to_write = to_write - nr_to_writebump;
+ wbc->range_start = range_start;
+ return ret;
+}
+
+#define FALL_BACK_TO_NONDELALLOC 1
+static int ext4_nonda_switch(struct super_block *sb)
+{
+ s64 free_blocks, dirty_blocks;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ /*
+ * switch to non delalloc mode if we are running low
+ * on free block. The free block accounting via percpu
+ * counters can get slightly wrong with FBC_BATCH getting
+ * accumulated on each CPU without updating global counters
+ * Delalloc need an accurate free block accounting. So switch
+ * to non delalloc when we are near to error range.
+ */
+ free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+ dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter);
+ if (2 * free_blocks < 3 * dirty_blocks ||
+ free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
+ /*
+ * free block count is less that 150% of dirty blocks
+ * or free blocks is less that watermark
+ */
+ return 1;
+ }
+ return 0;
+}
+
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ int ret, retries = 0;
+ struct page *page;
+ pgoff_t index;
+ unsigned from, to;
+ struct inode *inode = mapping->host;
+ handle_t *handle;
+
+ index = pos >> PAGE_CACHE_SHIFT;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + len;
+
+ if (ext4_nonda_switch(inode->i_sb)) {
+ *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
+ return ext4_write_begin(file, mapping, pos,
+ len, flags, pagep, fsdata);
+ }
+ *fsdata = (void *)0;
+retry:
+ /*
+ * With delayed allocation, we don't log the i_disksize update
+ * if there is delayed block allocation. But we still need
+ * to journalling the i_disksize update if writes to the end
+ * of file which has an already mapped buffer.
+ */
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ page = __grab_cache_page(mapping, index);
+ if (!page) {
+ ext4_journal_stop(handle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ *pagep = page;
+
+ ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ext4_da_get_block_prep);
+ if (ret < 0) {
+ unlock_page(page);
+ ext4_journal_stop(handle);
+ page_cache_release(page);
+ /*
+ * block_write_begin may have instantiated a few blocks
+ * outside i_size. Trim these off again. Don't need
+ * i_size_read because we hold i_mutex.
+ */
+ if (pos + len > inode->i_size)
+ vmtruncate(inode, inode->i_size);
+ }
+
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+out:
+ return ret;
+}
+
+/*
+ * Check if we should update i_disksize
+ * when write to the end of file but not require block allocation
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+ unsigned long offset)
+{
+ struct buffer_head *bh;
+ struct inode *inode = page->mapping->host;
+ unsigned int idx;
+ int i;
+
+ bh = page_buffers(page);
+ idx = offset >> inode->i_blkbits;
+
+ for (i = 0; i < idx; i++)
+ bh = bh->b_this_page;
+
+ if (!buffer_mapped(bh) || (buffer_delay(bh)))
+ return 0;
+ return 1;
+}
+
+static int ext4_da_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ int ret = 0, ret2;
+ handle_t *handle = ext4_journal_current_handle();
+ loff_t new_i_size;
+ unsigned long start, end;
+ int write_mode = (int)(unsigned long)fsdata;
+
+ if (write_mode == FALL_BACK_TO_NONDELALLOC) {
+ if (ext4_should_order_data(inode)) {
+ return ext4_ordered_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
+ } else if (ext4_should_writeback_data(inode)) {
+ return ext4_writeback_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
+ } else {
+ BUG();
+ }
+ }
+
+ start = pos & (PAGE_CACHE_SIZE - 1);
+ end = start + copied - 1;
+
+ /*
+ * generic_write_end() will run mark_inode_dirty() if i_size
+ * changes. So let's piggyback the i_disksize mark_inode_dirty
+ * into that.
+ */
+
+ new_i_size = pos + copied;
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ if (ext4_da_should_update_i_disksize(page, end)) {
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ /*
+ * Updating i_disksize when extending file
+ * without needing block allocation
+ */
+ if (ext4_should_order_data(inode))
+ ret = ext4_jbd2_file_inode(handle,
+ inode);
+
+ EXT4_I(inode)->i_disksize = new_i_size;
+ }
+ up_write(&EXT4_I(inode)->i_data_sem);
+ /* We need to mark inode dirty even if
+ * new_i_size is less that inode->i_size
+ * bu greater than i_disksize.(hint delalloc)
+ */
+ ext4_mark_inode_dirty(handle, inode);
+ }
+ }
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
+ page, fsdata);
+ copied = ret2;
+ if (ret2 < 0)
+ ret = ret2;
+ ret2 = ext4_journal_stop(handle);
+ if (!ret)
+ ret = ret2;
+
+ return ret ? ret : copied;
+}
+
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+ /*
+ * Drop reserved blocks
+ */
+ BUG_ON(!PageLocked(page));
+ if (!page_has_buffers(page))
+ goto out;
+
+ ext4_da_page_release_reservation(page, offset);
+
+out:
+ ext4_invalidatepage(page, offset);
+
+ return;
+}
+
+
/*
* bmap() is special. It gets used by applications such as lilo and by
* the swapper to find the on-disk block of a specific piece of data.
@@ -1418,6 +2686,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+ test_opt(inode->i_sb, DELALLOC)) {
+ /*
+ * With delalloc we want to sync the file
+ * so that we can make sure we allocate
+ * blocks for file
+ */
+ filemap_write_and_wait(mapping);
+ }
+
if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
/*
* This is a REALLY heavyweight approach, but the use of
@@ -1447,7 +2725,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
return 0;
}
- return generic_block_bmap(mapping,block,ext4_get_block);
+ return generic_block_bmap(mapping, block, ext4_get_block);
}
static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -1462,21 +2740,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
- if (buffer_mapped(bh))
- return ext4_journal_dirty_data(handle, bh);
- return 0;
-}
-
/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext4_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
- * refers to old data.
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
+ * need to file the inode to the transaction's list in ordered mode because if
+ * we are writing back data added by write(), the inode is already there and if
+ * we are writing back data modified via mmap(), noone guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start transaction directly because transaction start ranks above page
+ * lock so we have to do some magic.
*
- * In all journalling modes block_write_full_page() will start the I/O.
+ * In all journaling modes block_write_full_page() will start the I/O.
*
* Problem:
*
@@ -1518,105 +2792,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
* disastrous. Any write() or metadata operation will sync the fs for
* us.
*
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
*/
-static int ext4_ordered_writepage(struct page *page,
+static int __ext4_normal_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- struct buffer_head *page_bufs;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
- J_ASSERT(PageLocked(page));
-
- /*
- * We give up here if we're reentered, because it might be for a
- * different filesystem.
- */
- if (ext4_journal_current_handle())
- goto out_fail;
-
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ if (test_opt(inode->i_sb, NOBH))
+ return nobh_writepage(page,
+ ext4_normal_get_block_write, wbc);
+ else
+ return block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
+}
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
- }
+static int ext4_normal_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ loff_t size = i_size_read(inode);
+ loff_t len;
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
+ J_ASSERT(PageLocked(page));
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* if page has buffers it should all be mapped
+ * and allocated. If there are not buffers attached
+ * to the page we know the page is dirty but it lost
+ * buffers. That means that at some moment in time
+ * after write_begin() / write_end() has been called
+ * all buffers have been clean and thus they must have been
+ * written at least once. So they are all mapped and we can
+ * happily proceed with mapping them and writing the page.
+ */
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped_or_delay));
}
- page_bufs = page_buffers(page);
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bget_one);
- ret = block_write_full_page(page, ext4_get_block, wbc);
-
- /*
- * The page can become unlocked at any point now, and
- * truncate can then come in and change things. So we
- * can't touch *page from now on. But *page_bufs is
- * safe due to elevated refcount.
- */
+ if (!ext4_journal_current_handle())
+ return __ext4_normal_writepage(page, wbc);
- /*
- * And attach them to the current transaction. But only if
- * block_write_full_page() succeeded. Otherwise they are unmapped,
- * and generally junk.
- */
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
- NULL, jbd2_journal_dirty_data_fn);
- if (!ret)
- ret = err;
- }
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bput_one);
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-
-out_fail:
redirty_page_for_writepage(wbc, page);
unlock_page(page);
- return ret;
+ return 0;
}
-static int ext4_writeback_writepage(struct page *page,
+static int __ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head *page_bufs;
handle_t *handle = NULL;
int ret = 0;
int err;
- if (ext4_journal_current_handle())
- goto out_fail;
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_normal_get_block_write);
+ if (ret != 0)
+ goto out_unlock;
+
+ page_bufs = page_buffers(page);
+ walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
+ bget_one);
+ /* As soon as we unlock the page, it can go away, but we have
+ * references to buffers so we are safe */
+ unlock_page(page);
handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_fail;
+ goto out;
}
- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_get_block, wbc);
- else
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ ret = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+ err = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, write_end_fn);
+ if (ret == 0)
+ ret = err;
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
- return ret;
-out_fail:
- redirty_page_for_writepage(wbc, page);
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bput_one);
+ EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ goto out;
+
+out_unlock:
unlock_page(page);
+out:
return ret;
}
@@ -1624,59 +2896,53 @@ static int ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
+ loff_t size = i_size_read(inode);
+ loff_t len;
- if (ext4_journal_current_handle())
- goto no_write;
+ J_ASSERT(PageLocked(page));
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* if page has buffers it should all be mapped
+ * and allocated. If there are not buffers attached
+ * to the page we know the page is dirty but it lost
+ * buffers. That means that at some moment in time
+ * after write_begin() / write_end() has been called
+ * all buffers have been clean and thus they must have been
+ * written at least once. So they are all mapped and we can
+ * happily proceed with mapping them and writing the page.
+ */
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped_or_delay));
+ }
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
+ if (ext4_journal_current_handle())
goto no_write;
- }
- if (!page_has_buffers(page) || PageChecked(page)) {
+ if (PageChecked(page)) {
/*
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
*/
ClearPageChecked(page);
- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext4_get_block);
- if (ret != 0) {
- ext4_journal_stop(handle);
- goto out_unlock;
- }
- ret = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-
- err = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, write_end_fn);
- if (ret == 0)
- ret = err;
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
- unlock_page(page);
+ return __ext4_journalled_writepage(page, wbc);
} else {
/*
* It may be a page full of checkpoint-mode buffers. We don't
* really know unless we go poke around in the buffer_heads.
* But block_write_full_page will do the right thing.
*/
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ return block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
}
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
-out:
- return ret;
-
no_write:
redirty_page_for_writepage(wbc, page);
-out_unlock:
unlock_page(page);
- goto out;
+ return 0;
}
static int ext4_readpage(struct file *file, struct page *page)
@@ -1817,50 +3083,75 @@ static int ext4_journalled_set_page_dirty(struct page *page)
}
static const struct address_space_operations ext4_ordered_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
- .writepage = ext4_ordered_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext4_write_begin,
- .write_end = ext4_ordered_write_end,
- .bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
- .migratepage = buffer_migrate_page,
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_normal_writepage,
+ .sync_page = block_sync_page,
+ .write_begin = ext4_write_begin,
+ .write_end = ext4_ordered_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
};
static const struct address_space_operations ext4_writeback_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
- .writepage = ext4_writeback_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext4_write_begin,
- .write_end = ext4_writeback_write_end,
- .bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
- .direct_IO = ext4_direct_IO,
- .migratepage = buffer_migrate_page,
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_normal_writepage,
+ .sync_page = block_sync_page,
+ .write_begin = ext4_write_begin,
+ .write_end = ext4_writeback_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
};
static const struct address_space_operations ext4_journalled_aops = {
- .readpage = ext4_readpage,
- .readpages = ext4_readpages,
- .writepage = ext4_journalled_writepage,
- .sync_page = block_sync_page,
- .write_begin = ext4_write_begin,
- .write_end = ext4_journalled_write_end,
- .set_page_dirty = ext4_journalled_set_page_dirty,
- .bmap = ext4_bmap,
- .invalidatepage = ext4_invalidatepage,
- .releasepage = ext4_releasepage,
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_journalled_writepage,
+ .sync_page = block_sync_page,
+ .write_begin = ext4_write_begin,
+ .write_end = ext4_journalled_write_end,
+ .set_page_dirty = ext4_journalled_set_page_dirty,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .is_partially_uptodate = block_is_partially_uptodate,
+};
+
+static const struct address_space_operations ext4_da_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_da_writepage,
+ .writepages = ext4_da_writepages,
+ .sync_page = block_sync_page,
+ .write_begin = ext4_da_write_begin,
+ .write_end = ext4_da_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_da_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+ .is_partially_uptodate = block_is_partially_uptodate,
};
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode))
+ if (ext4_should_order_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
+ else if (ext4_should_order_data(inode))
inode->i_mapping->a_ops = &ext4_ordered_aops;
+ else if (ext4_should_writeback_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
else if (ext4_should_writeback_data(inode))
inode->i_mapping->a_ops = &ext4_writeback_aops;
else
@@ -1873,7 +3164,7 @@ void ext4_set_aops(struct inode *inode)
* This required during truncate. We need to physically zero the tail end
* of that block so it doesn't yield old data if the file is later grown.
*/
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
+int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1882,8 +3173,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
+ struct page *page;
int err = 0;
+ page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return -EINVAL;
+
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1956,7 +3252,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
err = ext4_journal_dirty_metadata(handle, bh);
} else {
if (ext4_should_order_data(inode))
- err = ext4_journal_dirty_data(handle, bh);
+ err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
@@ -2035,7 +3331,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
if (!partial->key && *partial->p)
/* Writer: end */
goto no_top;
- for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
+ for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--)
;
/*
* OK, we've found the last block that must survive. The rest of our
@@ -2054,7 +3350,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
}
/* Writer: end */
- while(partial > p) {
+ while (partial > p) {
brelse(partial->bh);
partial--;
}
@@ -2179,7 +3475,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
if (this_bh) {
BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, this_bh);
+
+ /*
+ * The buffer head should have an attached journal head at this
+ * point. However, if the data is corrupted and an indirect
+ * block pointed to itself, it would have been detached when
+ * the block was cleared. Check for this instead of OOPSing.
+ */
+ if (bh2jh(this_bh))
+ ext4_journal_dirty_metadata(handle, this_bh);
+ else
+ ext4_error(inode->i_sb, __func__,
+ "circular indirect block detected, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long) this_bh->b_blocknr);
}
}
@@ -2232,9 +3542,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
/* This zaps the entire block. Bottom up. */
BUFFER_TRACE(bh, "free child branches");
ext4_free_branches(handle, inode, bh,
- (__le32*)bh->b_data,
- (__le32*)bh->b_data + addr_per_block,
- depth);
+ (__le32 *) bh->b_data,
+ (__le32 *) bh->b_data + addr_per_block,
+ depth);
/*
* We've probably journalled the indirect block several
@@ -2305,6 +3615,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
}
+int ext4_can_truncate(struct inode *inode)
+{
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return 0;
+ if (S_ISREG(inode->i_mode))
+ return 1;
+ if (S_ISDIR(inode->i_mode))
+ return 1;
+ if (S_ISLNK(inode->i_mode))
+ return !ext4_inode_is_fast_symlink(inode);
+ return 0;
+}
+
/*
* ext4_truncate()
*
@@ -2347,51 +3670,25 @@ void ext4_truncate(struct inode *inode)
int n;
ext4_lblk_t last_block;
unsigned blocksize = inode->i_sb->s_blocksize;
- struct page *page;
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
+ if (!ext4_can_truncate(inode))
return;
- if (ext4_inode_is_fast_symlink(inode))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- return;
-
- /*
- * We have to lock the EOF page here, because lock_page() nests
- * outside jbd2_journal_start().
- */
- if ((inode->i_size & (blocksize - 1)) == 0) {
- /* Block boundary? Nothing to do */
- page = NULL;
- } else {
- page = grab_cache_page(mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
- if (!page)
- return;
- }
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- ext4_ext_truncate(inode, page);
+ ext4_ext_truncate(inode);
return;
}
handle = start_transaction(inode);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return; /* AKPM: return what? */
- }
last_block = (inode->i_size + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (blocksize - 1))
+ if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+ goto out_stop;
n = ext4_block_to_path(inode, last_block, offsets, NULL);
if (n == 0)
@@ -2410,6 +3707,14 @@ void ext4_truncate(struct inode *inode)
goto out_stop;
/*
+ * From here we block out all ext4_get_block() callers who want to
+ * modify the block allocation tree.
+ */
+ down_write(&ei->i_data_sem);
+
+ ext4_discard_preallocations(inode);
+
+ /*
* The orphan list entry will now protect us from any crash which
* occurs before the truncate completes, so it is now safe to propagate
* the new, shorter inode size (held for now in i_size) into the
@@ -2418,12 +3723,6 @@ void ext4_truncate(struct inode *inode)
*/
ei->i_disksize = inode->i_size;
- /*
- * From here we block out all ext4_get_block() callers who want to
- * modify the block allocation tree.
- */
- down_write(&ei->i_data_sem);
-
if (n == 1) { /* direct blocks */
ext4_free_data(handle, inode, NULL, i_data+offsets[0],
i_data + EXT4_NDIR_BLOCKS);
@@ -2484,8 +3783,6 @@ do_indirects:
;
}
- ext4_discard_reservation(inode);
-
up_write(&ei->i_data_sem);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
@@ -2510,41 +3807,6 @@ out_stop:
ext4_journal_stop(handle);
}
-static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
- unsigned long ino, struct ext4_iloc *iloc)
-{
- ext4_group_t block_group;
- unsigned long offset;
- ext4_fsblk_t block;
- struct ext4_group_desc *gdp;
-
- if (!ext4_valid_inum(sb, ino)) {
- /*
- * This error is already checked for in namei.c unless we are
- * looking at an NFS filehandle, in which case no error
- * report is needed
- */
- return 0;
- }
-
- block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
- gdp = ext4_get_group_desc(sb, block_group, NULL);
- if (!gdp)
- return 0;
-
- /*
- * Figure out the offset within the block group inode table
- */
- offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) *
- EXT4_INODE_SIZE(sb);
- block = ext4_inode_table(sb, gdp) +
- (offset >> EXT4_BLOCK_SIZE_BITS(sb));
-
- iloc->block_group = block_group;
- iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1);
- return block;
-}
-
/*
* ext4_get_inode_loc returns with an extra refcount against the inode's
* underlying buffer_head on success. If 'in_mem' is true, we have all
@@ -2554,23 +3816,49 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb,
static int __ext4_get_inode_loc(struct inode *inode,
struct ext4_iloc *iloc, int in_mem)
{
- ext4_fsblk_t block;
- struct buffer_head *bh;
+ struct ext4_group_desc *gdp;
+ struct buffer_head *bh;
+ struct super_block *sb = inode->i_sb;
+ ext4_fsblk_t block;
+ int inodes_per_block, inode_offset;
+
+ iloc->bh = 0;
+ if (!ext4_valid_inum(sb, inode->i_ino))
+ return -EIO;
- block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc);
- if (!block)
+ iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
+ gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
+ if (!gdp)
return -EIO;
- bh = sb_getblk(inode->i_sb, block);
+ /*
+ * Figure out the offset within the block group inode table
+ */
+ inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb));
+ inode_offset = ((inode->i_ino - 1) %
+ EXT4_INODES_PER_GROUP(sb));
+ block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
+ iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+
+ bh = sb_getblk(sb, block);
if (!bh) {
- ext4_error (inode->i_sb, "ext4_get_inode_loc",
- "unable to read inode block - "
- "inode=%lu, block=%llu",
- inode->i_ino, block);
+ ext4_error(sb, "ext4_get_inode_loc", "unable to read "
+ "inode block - inode=%lu, block=%llu",
+ inode->i_ino, block);
return -EIO;
}
if (!buffer_uptodate(bh)) {
lock_buffer(bh);
+
+ /*
+ * If the buffer has the write error flag, we have failed
+ * to write out another inode in the same block. In this
+ * case, we don't have to read the block because we may
+ * read the old inode data successfully.
+ */
+ if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+
if (buffer_uptodate(bh)) {
/* someone brought it uptodate while we waited */
unlock_buffer(bh);
@@ -2584,28 +3872,12 @@ static int __ext4_get_inode_loc(struct inode *inode,
*/
if (in_mem) {
struct buffer_head *bitmap_bh;
- struct ext4_group_desc *desc;
- int inodes_per_buffer;
- int inode_offset, i;
- ext4_group_t block_group;
- int start;
-
- block_group = (inode->i_ino - 1) /
- EXT4_INODES_PER_GROUP(inode->i_sb);
- inodes_per_buffer = bh->b_size /
- EXT4_INODE_SIZE(inode->i_sb);
- inode_offset = ((inode->i_ino - 1) %
- EXT4_INODES_PER_GROUP(inode->i_sb));
- start = inode_offset & ~(inodes_per_buffer - 1);
+ int i, start;
- /* Is the inode bitmap in cache? */
- desc = ext4_get_group_desc(inode->i_sb,
- block_group, NULL);
- if (!desc)
- goto make_io;
+ start = inode_offset & ~(inodes_per_block - 1);
- bitmap_bh = sb_getblk(inode->i_sb,
- ext4_inode_bitmap(inode->i_sb, desc));
+ /* Is the inode bitmap in cache? */
+ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
if (!bitmap_bh)
goto make_io;
@@ -2618,14 +3890,14 @@ static int __ext4_get_inode_loc(struct inode *inode,
brelse(bitmap_bh);
goto make_io;
}
- for (i = start; i < start + inodes_per_buffer; i++) {
+ for (i = start; i < start + inodes_per_block; i++) {
if (i == inode_offset)
continue;
if (ext4_test_bit(i, bitmap_bh->b_data))
break;
}
brelse(bitmap_bh);
- if (i == start + inodes_per_buffer) {
+ if (i == start + inodes_per_block) {
/* all other inodes are free, so skip I/O */
memset(bh->b_data, 0, bh->b_size);
set_buffer_uptodate(bh);
@@ -2636,6 +3908,36 @@ static int __ext4_get_inode_loc(struct inode *inode,
make_io:
/*
+ * If we need to do any I/O, try to pre-readahead extra
+ * blocks from the inode table.
+ */
+ if (EXT4_SB(sb)->s_inode_readahead_blks) {
+ ext4_fsblk_t b, end, table;
+ unsigned num;
+
+ table = ext4_inode_table(sb, gdp);
+ /* Make sure s_inode_readahead_blks is a power of 2 */
+ while (EXT4_SB(sb)->s_inode_readahead_blks &
+ (EXT4_SB(sb)->s_inode_readahead_blks-1))
+ EXT4_SB(sb)->s_inode_readahead_blks =
+ (EXT4_SB(sb)->s_inode_readahead_blks &
+ (EXT4_SB(sb)->s_inode_readahead_blks-1));
+ b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
+ if (table > b)
+ b = table;
+ end = b + EXT4_SB(sb)->s_inode_readahead_blks;
+ num = EXT4_INODES_PER_GROUP(sb);
+ if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ num -= le16_to_cpu(gdp->bg_itable_unused);
+ table += num / inodes_per_block;
+ if (end > table)
+ end = table;
+ while (b <= end)
+ sb_breadahead(sb, b++);
+ }
+
+ /*
* There are other valid inodes in the buffer, this inode
* has in-inode xattrs, or we don't have this inode in memory.
* Read the block from disk.
@@ -2645,10 +3947,9 @@ make_io:
submit_bh(READ_META, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
- ext4_error(inode->i_sb, "ext4_get_inode_loc",
- "unable to read inode block - "
- "inode=%lu, block=%llu",
- inode->i_ino, block);
+ ext4_error(sb, __func__,
+ "unable to read inode block - inode=%lu, "
+ "block=%llu", inode->i_ino, block);
brelse(bh);
return -EIO;
}
@@ -2740,11 +4041,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
return inode;
ei = EXT4_I(inode);
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
ei->i_acl = EXT4_ACL_NOT_CACHED;
ei->i_default_acl = EXT4_ACL_NOT_CACHED;
#endif
- ei->i_block_alloc_info = NULL;
ret = __ext4_get_inode_loc(inode, &iloc, 0);
if (ret < 0)
@@ -2754,7 +4054,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
- if(!(test_opt (inode->i_sb, NO_UID32))) {
+ if (!(test_opt(inode->i_sb, NO_UID32))) {
inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
}
@@ -2772,7 +4072,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (inode->i_mode == 0 ||
!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
/* this inode is deleted */
- brelse (bh);
+ brelse(bh);
ret = -ESTALE;
goto bad_inode;
}
@@ -2805,7 +4105,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
EXT4_INODE_SIZE(inode->i_sb)) {
- brelse (bh);
+ brelse(bh);
ret = -EIO;
goto bad_inode;
}
@@ -2858,7 +4158,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
}
- brelse (iloc.bh);
+ brelse(iloc.bh);
ext4_set_inode_flags(inode);
unlock_new_inode(inode);
return inode;
@@ -2940,14 +4240,14 @@ static int ext4_do_update_inode(handle_t *handle,
ext4_get_inode_flags(ei);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
- if(!(test_opt(inode->i_sb, NO_UID32))) {
+ if (!(test_opt(inode->i_sb, NO_UID32))) {
raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
/*
* Fix up interoperability with old kernels. Otherwise, old inodes get
* re-used with the upper 16 bits of the uid/gid intact
*/
- if(!ei->i_dtime) {
+ if (!ei->i_dtime) {
raw_inode->i_uid_high =
cpu_to_le16(high_16_bits(inode->i_uid));
raw_inode->i_gid_high =
@@ -3035,7 +4335,7 @@ static int ext4_do_update_inode(handle_t *handle,
ei->i_state &= ~EXT4_STATE_NEW;
out_brelse:
- brelse (bh);
+ brelse(bh);
ext4_std_error(inode->i_sb, err);
return err;
}
@@ -3107,7 +4407,14 @@ int ext4_write_inode(struct inode *inode, int wait)
* be freed, so we have a strong guarantee that no future commit will
* leave these blocks visible to the user.)
*
- * Called with inode->sem down.
+ * Another thing we have to assure is that if we are in ordered mode
+ * and inode is still attached to the committing transaction, we must
+ * we start writeout of all the dirty pages which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction are already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
*/
int ext4_setattr(struct dentry *dentry, struct iattr *attr)
{
@@ -3173,6 +4480,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (!error)
error = rc;
ext4_journal_stop(handle);
+
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
+ attr->ia_size);
+ if (error) {
+ /* Do as much error cleanup as possible */
+ handle = ext4_journal_start(inode, 3);
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ goto err_out;
+ }
+ }
}
rc = inode_setattr(inode, attr);
@@ -3193,58 +4516,156 @@ err_out:
return error;
}
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode;
+ unsigned long delalloc_blocks;
+
+ inode = dentry->d_inode;
+ generic_fillattr(inode, stat);
+ /*
+ * We can't update i_blocks if the block allocation is delayed
+ * otherwise in the case of system crash before the real block
+ * allocation is done, we will have i_blocks inconsistent with
+ * on-disk file blocks.
+ * We always keep i_blocks updated together with real
+ * allocation. But to not confuse with user, stat
+ * will return the blocks that include the delayed allocation
+ * blocks for this file.
+ */
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+ return 0;
+}
+
+static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
+ int chunk)
+{
+ int indirects;
+
+ /* if nrblocks are contiguous */
+ if (chunk) {
+ /*
+ * With N contiguous data blocks, it need at most
+ * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
+ * 2 dindirect blocks
+ * 1 tindirect block
+ */
+ indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ return indirects + 3;
+ }
+ /*
+ * if nrblocks are not contiguous, worse case, each block touch
+ * a indirect block, and each indirect block touch a double indirect
+ * block, plus a triple indirect block
+ */
+ indirects = nrblocks * 2 + 1;
+ return indirects;
+}
+
+static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+{
+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+ return ext4_indirect_trans_blocks(inode, nrblocks, 0);
+ return ext4_ext_index_trans_blocks(inode, nrblocks, 0);
+}
/*
- * How many blocks doth make a writepage()?
- *
- * With N blocks per page, it may be:
- * N data blocks
- * 2 indirect block
- * 2 dindirect
- * 1 tindirect
- * N+5 bitmap blocks (from the above)
- * N+5 group descriptor summary blocks
- * 1 inode block
- * 1 superblock.
- * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files
- *
- * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
- *
- * With ordered or writeback data it's the same, less the N data blocks.
- *
- * If the inode's direct blocks can hold an integral number of pages then a
- * page cannot straddle two indirect blocks, and we can only touch one indirect
- * and dindirect block, and the "5" above becomes "3".
- *
- * This still overestimates under most circumstances. If we were to pass the
- * start and end offsets in here as well we could do block_to_path() on each
- * block and work out the exact number of indirects which are touched. Pah.
+ * Account for index blocks, block groups bitmaps and block group
+ * descriptor blocks if modify datablocks and index blocks
+ * worse case, the indexs blocks spread over different block groups
+ *
+ * If datablocks are discontiguous, they are possible to spread over
+ * different block groups too. If they are contiugous, with flexbg,
+ * they could still across block group boundary.
+ *
+ * Also account for superblock, inode, quota and xattr blocks
*/
+int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+{
+ int groups, gdpblocks;
+ int idxblocks;
+ int ret = 0;
+
+ /*
+ * How many index blocks need to touch to modify nrblocks?
+ * The "Chunk" flag indicating whether the nrblocks is
+ * physically contiguous on disk
+ *
+ * For Direct IO and fallocate, they calls get_block to allocate
+ * one single extent at a time, so they could set the "Chunk" flag
+ */
+ idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
+
+ ret = idxblocks;
+
+ /*
+ * Now let's see how many group bitmaps and group descriptors need
+ * to account
+ */
+ groups = idxblocks;
+ if (chunk)
+ groups += 1;
+ else
+ groups += nrblocks;
+
+ gdpblocks = groups;
+ if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
+ groups = EXT4_SB(inode->i_sb)->s_groups_count;
+ if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
+ gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
+ /* bitmaps and block group descriptor blocks */
+ ret += groups + gdpblocks;
+
+ /* Blocks for super block, inode, quota and xattr blocks */
+ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
+
+ return ret;
+}
+
+/*
+ * Calulate the total number of credits to reserve to fit
+ * the modification of a single pages into a single transaction,
+ * which may include multiple chunks of block allocations.
+ *
+ * This could be called via ext4_write_begin()
+ *
+ * We need to consider the worse case, when
+ * one new block per extent.
+ */
int ext4_writepage_trans_blocks(struct inode *inode)
{
int bpp = ext4_journal_blocks_per_page(inode);
- int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
int ret;
- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
- return ext4_ext_writepage_trans_blocks(inode, bpp);
+ ret = ext4_meta_trans_blocks(inode, bpp, 0);
+ /* Account for data blocks for journalled mode */
if (ext4_should_journal_data(inode))
- ret = 3 * (bpp + indirects) + 2;
- else
- ret = 2 * (bpp + indirects) + 2;
-
-#ifdef CONFIG_QUOTA
- /* We know that structure was already allocated during DQUOT_INIT so
- * we will be updating only the data blocks + inodes */
- ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
-#endif
-
+ ret += bpp;
return ret;
}
/*
+ * Calculate the journal credits for a chunk of data modification.
+ *
+ * This is called from DIO, fallocate or whoever calling
+ * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks.
+ *
+ * journal buffers for data blocks are not included here, as DIO
+ * and fallocate do no need to journal data buffers.
+ */
+int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
+{
+ return ext4_meta_trans_blocks(inode, nrblocks, 1);
+}
+
+/*
* The caller must have previously called ext4_reserve_inode_write().
* Give this, we know that the caller already has write access to iloc->bh.
*/
@@ -3506,3 +4927,65 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
+
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_mapped(bh);
+}
+
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ loff_t size;
+ unsigned long len;
+ int ret = -EINVAL;
+ void *fsdata;
+ struct file *file = vma->vm_file;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * Get i_alloc_sem to stop truncates messing with the inode. We cannot
+ * get i_mutex because we are already holding mmap_sem.
+ */
+ down_read(&inode->i_alloc_sem);
+ size = i_size_read(inode);
+ if (page->mapping != mapping || size <= page_offset(page)
+ || !PageUptodate(page)) {
+ /* page got truncated from under us? */
+ goto out_unlock;
+ }
+ ret = 0;
+ if (PageMappedToDisk(page))
+ goto out_unlock;
+
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* return if we have all the buffers mapped */
+ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped))
+ goto out_unlock;
+ }
+ /*
+ * OK, we need to fill the hole... Do write_begin write_end
+ * to do block allocation/reservation.We are not holding
+ * inode.i__mutex here. That allow * parallel write_begin,
+ * write_end call. lock_page prevent this from happening
+ * on the same page though
+ */
+ ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
+ len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+ if (ret < 0)
+ goto out_unlock;
+ ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
+ len, len, page, fsdata);
+ if (ret < 0)
+ goto out_unlock;
+ ret = 0;
+out_unlock:
+ up_read(&inode->i_alloc_sem);
+ return ret;
+}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7a6c2f1faba..dc99b4776d5 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -23,9 +23,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
struct inode *inode = filp->f_dentry->d_inode;
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int flags;
- unsigned short rsv_window_size;
- ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+ ext4_debug("cmd = %u, arg = %lu\n", cmd, arg);
switch (cmd) {
case EXT4_IOC_GETFLAGS:
@@ -34,7 +33,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return put_user(flags, (int __user *) arg);
case EXT4_IOC_SETFLAGS: {
handle_t *handle = NULL;
- int err;
+ int err, migrate = 0;
struct ext4_iloc iloc;
unsigned int oldflags;
unsigned int jflag;
@@ -82,6 +81,17 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!capable(CAP_SYS_RESOURCE))
goto flags_out;
}
+ if (oldflags & EXT4_EXTENTS_FL) {
+ /* We don't support clearning extent flags */
+ if (!(flags & EXT4_EXTENTS_FL)) {
+ err = -EOPNOTSUPP;
+ goto flags_out;
+ }
+ } else if (flags & EXT4_EXTENTS_FL) {
+ /* migrate the file */
+ migrate = 1;
+ flags &= ~EXT4_EXTENTS_FL;
+ }
handle = ext4_journal_start(inode, 1);
if (IS_ERR(handle)) {
@@ -109,6 +119,10 @@ flags_err:
if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
err = ext4_change_inode_journal_flag(inode, jflag);
+ if (err)
+ goto flags_out;
+ if (migrate)
+ err = ext4_ext_migrate(inode);
flags_out:
mutex_unlock(&inode->i_mutex);
mnt_drop_write(filp->f_path.mnt);
@@ -175,53 +189,10 @@ setversion_out:
return ret;
}
#endif
- case EXT4_IOC_GETRSVSZ:
- if (test_opt(inode->i_sb, RESERVATION)
- && S_ISREG(inode->i_mode)
- && ei->i_block_alloc_info) {
- rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size;
- return put_user(rsv_window_size, (int __user *)arg);
- }
- return -ENOTTY;
- case EXT4_IOC_SETRSVSZ: {
- int err;
-
- if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
- return -ENOTTY;
-
- if (!is_owner_or_cap(inode))
- return -EACCES;
-
- if (get_user(rsv_window_size, (int __user *)arg))
- return -EFAULT;
-
- err = mnt_want_write(filp->f_path.mnt);
- if (err)
- return err;
-
- if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
- rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
-
- /*
- * need to allocate reservation structure for this inode
- * before set the window size
- */
- down_write(&ei->i_data_sem);
- if (!ei->i_block_alloc_info)
- ext4_init_block_alloc_info(inode);
-
- if (ei->i_block_alloc_info){
- struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node;
- rsv->rsv_goal_size = rsv_window_size;
- }
- up_write(&ei->i_data_sem);
- mnt_drop_write(filp->f_path.mnt);
- return 0;
- }
case EXT4_IOC_GROUP_EXTEND: {
ext4_fsblk_t n_blocks_count;
struct super_block *sb = inode->i_sb;
- int err;
+ int err, err2;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
@@ -235,8 +206,10 @@ setversion_out:
err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ if (err == 0)
+ err = err2;
mnt_drop_write(filp->f_path.mnt);
return err;
@@ -244,7 +217,7 @@ setversion_out:
case EXT4_IOC_GROUP_ADD: {
struct ext4_new_group_data input;
struct super_block *sb = inode->i_sb;
- int err;
+ int err, err2;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
@@ -259,15 +232,36 @@ setversion_out:
err = ext4_group_add(sb, &input);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ if (err == 0)
+ err = err2;
mnt_drop_write(filp->f_path.mnt);
return err;
}
case EXT4_IOC_MIGRATE:
- return ext4_ext_migrate(inode, filp, cmd, arg);
+ {
+ int err;
+ if (!is_owner_or_cap(inode))
+ return -EACCES;
+
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+ /*
+ * inode_mutex prevent write and truncate on the file.
+ * Read still goes through. We take i_data_sem in
+ * ext4_ext_swap_inode_data before we switch the
+ * inode format to prevent read.
+ */
+ mutex_lock(&(inode->i_mutex));
+ err = ext4_ext_migrate(inode);
+ mutex_unlock(&(inode->i_mutex));
+ mnt_drop_write(filp->f_path.mnt);
+ return err;
+ }
default:
return -ENOTTY;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c9900aade15..b580714f0d8 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_zero_bit(addr, max, start) - fix;
+ ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
static inline int mb_find_next_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_bit(addr, max, start) - fix;
+ ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
@@ -471,9 +477,10 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
b2 = (unsigned char *) bitmap;
for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
if (b1[i] != b2[i]) {
- printk("corruption in group %lu at byte %u(%u):"
- " %x in copy != %x on disk/prealloc\n",
- e4b->bd_group, i, i * 8, b1[i], b2[i]);
+ printk(KERN_ERR "corruption in group %lu "
+ "at byte %u(%u): %x in copy != %x "
+ "on disk/prealloc\n",
+ e4b->bd_group, i, i * 8, b1[i], b2[i]);
BUG();
}
}
@@ -527,9 +534,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
void *buddy;
void *buddy2;
- if (!test_opt(sb, MBALLOC))
- return 0;
-
{
static int mb_check_counter;
if (mb_check_counter++ % 100 != 0)
@@ -778,16 +782,21 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (bh[i] == NULL)
goto out;
- if (bh_uptodate_or_lock(bh[i]))
+ if (buffer_uptodate(bh[i]) &&
+ !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
continue;
+ lock_buffer(bh[i]);
+ spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
ext4_init_block_bitmap(sb, bh[i],
first_group + i, desc);
set_buffer_uptodate(bh[i]);
unlock_buffer(bh[i]);
+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
continue;
}
+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
get_bh(bh[i]);
bh[i]->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh[i]);
@@ -803,6 +812,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (!buffer_uptodate(bh[i]))
goto out;
+ err = 0;
first_block = page->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
int group;
@@ -883,6 +893,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
int pnum;
int poff;
struct page *page;
+ int ret;
mb_debug("load group %lu\n", group);
@@ -914,15 +925,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
if (!PageUptodate(page)) {
- ext4_mb_init_cache(page, NULL);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
mb_cmp_bitmaps(e4b, page_address(page) +
(poff * sb->s_blocksize));
}
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
goto err;
+ }
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
mark_page_accessed(page);
@@ -938,14 +955,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
- if (!PageUptodate(page))
- ext4_mb_init_cache(page, e4b->bd_bitmap);
-
+ if (!PageUptodate(page)) {
+ ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ }
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
goto err;
+ }
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
mark_page_accessed(page);
@@ -962,7 +985,7 @@ err:
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
- return -EIO;
+ return ret;
}
static void ext4_mb_release_desc(struct ext4_buddy *e4b)
@@ -1031,7 +1054,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
}
}
-static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
int first, int count)
{
int block = 0;
@@ -1071,11 +1094,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
blocknr += block;
blocknr +=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
+ ext4_unlock_group(sb, e4b->bd_group);
ext4_error(sb, __func__, "double-free of inode"
" %lu's block %llu(bit %u in group %lu)\n",
inode ? inode->i_ino : 0, blocknr, block,
e4b->bd_group);
+ ext4_lock_group(sb, e4b->bd_group);
}
mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
e4b->bd_info->bb_counters[order]++;
@@ -1113,8 +1137,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
} while (1);
}
mb_check_buddy(e4b);
-
- return 0;
}
static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
@@ -1730,10 +1752,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
spin_unlock(&sbi->s_md_lock);
}
-
- /* searching for the right group start from the goal value specified */
- group = ac->ac_g_ex.fe_group;
-
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
/*
@@ -1743,6 +1761,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
repeat:
for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
ac->ac_criteria = cr;
+ /*
+ * searching for the right group start
+ * from the goal value specified
+ */
+ group = ac->ac_g_ex.fe_group;
+
for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
struct ext4_group_info *grp;
struct ext4_group_desc *desc;
@@ -1963,6 +1987,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
int rc;
int size;
+ if (unlikely(sbi->s_mb_history == NULL))
+ return -ENOMEM;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (s == NULL)
return -ENOMEM;
@@ -2143,9 +2169,10 @@ static void ext4_mb_history_release(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- remove_proc_entry("mb_groups", sbi->s_mb_proc);
- remove_proc_entry("mb_history", sbi->s_mb_proc);
-
+ if (sbi->s_proc != NULL) {
+ remove_proc_entry("mb_groups", sbi->s_proc);
+ remove_proc_entry("mb_history", sbi->s_proc);
+ }
kfree(sbi->s_mb_history);
}
@@ -2154,10 +2181,10 @@ static void ext4_mb_history_init(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
int i;
- if (sbi->s_mb_proc != NULL) {
- proc_create_data("mb_history", S_IRUGO, sbi->s_mb_proc,
+ if (sbi->s_proc != NULL) {
+ proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_history_fops, sb);
- proc_create_data("mb_groups", S_IRUGO, sbi->s_mb_proc,
+ proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_fops, sb);
}
@@ -2165,9 +2192,7 @@ static void ext4_mb_history_init(struct super_block *sb)
sbi->s_mb_history_cur = 0;
spin_lock_init(&sbi->s_mb_history_lock);
i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
- if (likely(sbi->s_mb_history != NULL))
- memset(sbi->s_mb_history, 0, i);
+ sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
/* if we can't allocate history, then we simple won't use it */
}
@@ -2215,21 +2240,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
#define ext4_mb_history_init(sb)
#endif
+
+/* Create and initialize ext4_group_info data for the given group. */
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ int i, len;
+ int metalen = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_info **meta_group_info;
+
+ /*
+ * First check if this group is the first of a reserved block.
+ * If it's true, we have to allocate a new table of pointers
+ * to ext4_group_info structures
+ */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+ metalen = sizeof(*meta_group_info) <<
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+ meta_group_info = kmalloc(metalen, GFP_KERNEL);
+ if (meta_group_info == NULL) {
+ printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
+ "buddy group\n");
+ goto exit_meta_group_info;
+ }
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
+ meta_group_info;
+ }
+
+ /*
+ * calculate needed size. if change bb_counters size,
+ * don't forget about ext4_mb_generate_buddy()
+ */
+ len = offsetof(typeof(**meta_group_info),
+ bb_counters[sb->s_blocksize_bits + 2]);
+
+ meta_group_info =
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+ i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+
+ meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+ if (meta_group_info[i] == NULL) {
+ printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+ goto exit_group_info;
+ }
+ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+ &(meta_group_info[i]->bb_state));
+
+ /*
+ * initialize bb_free to be able to skip
+ * empty groups without initialization
+ */
+ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ meta_group_info[i]->bb_free =
+ ext4_free_blocks_after_init(sb, group, desc);
+ } else {
+ meta_group_info[i]->bb_free =
+ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+
+#ifdef DOUBLE_CHECK
+ {
+ struct buffer_head *bh;
+ meta_group_info[i]->bb_bitmap =
+ kmalloc(sb->s_blocksize, GFP_KERNEL);
+ BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
+ bh = ext4_read_block_bitmap(sb, group);
+ BUG_ON(bh == NULL);
+ memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
+ sb->s_blocksize);
+ put_bh(bh);
+ }
+#endif
+
+ return 0;
+
+exit_group_info:
+ /* If a meta_group_info table has been allocated, release it now */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
+ kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
+exit_meta_group_info:
+ return -ENOMEM;
+} /* ext4_mb_add_groupinfo */
+
+/*
+ * Add a group to the existing groups.
+ * This function is used for online resize
+ */
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+ int err;
+
+ /* Add group based on group descriptor*/
+ err = ext4_mb_add_groupinfo(sb, group, desc);
+ if (err)
+ return err;
+
+ /*
+ * Cache pages containing dynamic mb_alloc datas (buddy and bitmap
+ * datas) are set not up to date so that they will be re-initilaized
+ * during the next call to ext4_mb_load_buddy
+ */
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ return 0;
+}
+
+/*
+ * Update an existing group.
+ * This function is used for online resize
+ */
+void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
+{
+ grp->bb_free += add;
+}
+
static int ext4_mb_init_backend(struct super_block *sb)
{
ext4_group_t i;
- int j, len, metalen;
+ int metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int num_meta_group_infos =
- (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
- EXT4_DESC_PER_BLOCK_BITS(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int num_meta_group_infos;
+ int num_meta_group_infos_max;
+ int array_size;
struct ext4_group_info **meta_group_info;
+ struct ext4_group_desc *desc;
+ /* This is the number of blocks used by GDT */
+ num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+ 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
+
+ /*
+ * This is the total number of blocks used by GDT including
+ * the number of reserved blocks for GDT.
+ * The s_group_info array is allocated with this value
+ * to allow a clean online resize without a complex
+ * manipulation of pointer.
+ * The drawback is the unused memory when no resize
+ * occurs but it's very low in terms of pages
+ * (see comments below)
+ * Need to handle this properly when META_BG resizing is allowed
+ */
+ num_meta_group_infos_max = num_meta_group_infos +
+ le16_to_cpu(es->s_reserved_gdt_blocks);
+
+ /*
+ * array_size is the size of s_group_info array. We round it
+ * to the next power of two because this approximation is done
+ * internally by kmalloc so we can have some more memory
+ * for free here (e.g. may be used for META_BG resize).
+ */
+ array_size = 1;
+ while (array_size < sizeof(*sbi->s_group_info) *
+ num_meta_group_infos_max)
+ array_size = array_size << 1;
/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
* kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
* So a two level scheme suffices for now. */
- sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
- num_meta_group_infos, GFP_KERNEL);
+ sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
if (sbi->s_group_info == NULL) {
printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
return -ENOMEM;
@@ -2256,63 +2452,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
sbi->s_group_info[i] = meta_group_info;
}
- /*
- * calculate needed size. if change bb_counters size,
- * don't forget about ext4_mb_generate_buddy()
- */
- len = sizeof(struct ext4_group_info);
- len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
for (i = 0; i < sbi->s_groups_count; i++) {
- struct ext4_group_desc *desc;
-
- meta_group_info =
- sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
- j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
-
- meta_group_info[j] = kzalloc(len, GFP_KERNEL);
- if (meta_group_info[j] == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
- goto err_freebuddy;
- }
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
printk(KERN_ERR
"EXT4-fs: can't read descriptor %lu\n", i);
- i++;
goto err_freebuddy;
}
- memset(meta_group_info[j], 0, len);
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
- &(meta_group_info[j]->bb_state));
-
- /*
- * initialize bb_free to be able to skip
- * empty groups without initialization
- */
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- meta_group_info[j]->bb_free =
- ext4_free_blocks_after_init(sb, i, desc);
- } else {
- meta_group_info[j]->bb_free =
- le16_to_cpu(desc->bg_free_blocks_count);
- }
-
- INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
-
-#ifdef DOUBLE_CHECK
- {
- struct buffer_head *bh;
- meta_group_info[j]->bb_bitmap =
- kmalloc(sb->s_blocksize, GFP_KERNEL);
- BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
- bh = read_block_bitmap(sb, i);
- BUG_ON(bh == NULL);
- memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
- sb->s_blocksize);
- put_bh(bh);
- }
-#endif
-
+ if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
+ goto err_freebuddy;
}
return 0;
@@ -2333,23 +2481,19 @@ err_freesgi:
int ext4_mb_init(struct super_block *sb, int needs_recovery)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- unsigned i;
+ unsigned i, j;
unsigned offset;
unsigned max;
-
- if (!test_opt(sb, MBALLOC))
- return 0;
+ int ret;
i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short);
sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_offsets == NULL) {
- clear_opt(sbi->s_mount_opt, MBALLOC);
return -ENOMEM;
}
sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_maxs == NULL) {
- clear_opt(sbi->s_mount_opt, MBALLOC);
kfree(sbi->s_mb_maxs);
return -ENOMEM;
}
@@ -2370,12 +2514,11 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
} while (i <= sb->s_blocksize_bits + 1);
/* init file for buddy data */
- i = ext4_mb_init_backend(sb);
- if (i) {
- clear_opt(sbi->s_mount_opt, MBALLOC);
+ ret = ext4_mb_init_backend(sb);
+ if (ret != 0) {
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
- return i;
+ return ret;
}
spin_lock_init(&sbi->s_md_lock);
@@ -2392,26 +2535,25 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
- i = sizeof(struct ext4_locality_group) * NR_CPUS;
- sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
+ sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
if (sbi->s_locality_groups == NULL) {
- clear_opt(sbi->s_mount_opt, MBALLOC);
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
return -ENOMEM;
}
- for (i = 0; i < NR_CPUS; i++) {
+ for_each_possible_cpu(i) {
struct ext4_locality_group *lg;
- lg = &sbi->s_locality_groups[i];
+ lg = per_cpu_ptr(sbi->s_locality_groups, i);
mutex_init(&lg->lg_mutex);
- INIT_LIST_HEAD(&lg->lg_prealloc_list);
+ for (j = 0; j < PREALLOC_TB_SIZE; j++)
+ INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
spin_lock_init(&lg->lg_prealloc_lock);
}
ext4_mb_init_per_dev_proc(sb);
ext4_mb_history_init(sb);
- printk("EXT4-fs: mballoc enabled\n");
+ printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
return 0;
}
@@ -2440,9 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
struct ext4_group_info *grinfo;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (!test_opt(sb, MBALLOC))
- return 0;
-
/* release freed, non-committed blocks */
spin_lock(&sbi->s_md_lock);
list_splice_init(&sbi->s_closed_transaction,
@@ -2498,8 +2637,7 @@ int ext4_mb_release(struct super_block *sb)
atomic_read(&sbi->s_mb_discarded));
}
- kfree(sbi->s_locality_groups);
-
+ free_percpu(sbi->s_locality_groups);
ext4_mb_history_release(sb);
ext4_mb_destroy_per_dev_proc(sb);
@@ -2548,8 +2686,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
ext4_lock_group(sb, md->group);
for (i = 0; i < md->num; i++) {
mb_debug(" %u", md->blocks[i]);
- err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
- BUG_ON(err != 0);
+ mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
}
mb_debug("\n");
ext4_unlock_group(sb, md->group);
@@ -2573,114 +2710,46 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
#define EXT4_MB_STREAM_REQ "stream_req"
#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
-
-
-#define MB_PROC_VALUE_READ(name) \
-static int ext4_mb_read_##name(char *page, char **start, \
- off_t off, int count, int *eof, void *data) \
-{ \
- struct ext4_sb_info *sbi = data; \
- int len; \
- *eof = 1; \
- if (off != 0) \
- return 0; \
- len = sprintf(page, "%ld\n", sbi->s_mb_##name); \
- *start = page; \
- return len; \
-}
-
-#define MB_PROC_VALUE_WRITE(name) \
-static int ext4_mb_write_##name(struct file *file, \
- const char __user *buf, unsigned long cnt, void *data) \
-{ \
- struct ext4_sb_info *sbi = data; \
- char str[32]; \
- long value; \
- if (cnt >= sizeof(str)) \
- return -EINVAL; \
- if (copy_from_user(str, buf, cnt)) \
- return -EFAULT; \
- value = simple_strtol(str, NULL, 0); \
- if (value <= 0) \
- return -ERANGE; \
- sbi->s_mb_##name = value; \
- return cnt; \
-}
-
-MB_PROC_VALUE_READ(stats);
-MB_PROC_VALUE_WRITE(stats);
-MB_PROC_VALUE_READ(max_to_scan);
-MB_PROC_VALUE_WRITE(max_to_scan);
-MB_PROC_VALUE_READ(min_to_scan);
-MB_PROC_VALUE_WRITE(min_to_scan);
-MB_PROC_VALUE_READ(order2_reqs);
-MB_PROC_VALUE_WRITE(order2_reqs);
-MB_PROC_VALUE_READ(stream_request);
-MB_PROC_VALUE_WRITE(stream_request);
-MB_PROC_VALUE_READ(group_prealloc);
-MB_PROC_VALUE_WRITE(group_prealloc);
-
-#define MB_PROC_HANDLER(name, var) \
-do { \
- proc = create_proc_entry(name, mode, sbi->s_mb_proc); \
- if (proc == NULL) { \
- printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
- goto err_out; \
- } \
- proc->data = sbi; \
- proc->read_proc = ext4_mb_read_##var ; \
- proc->write_proc = ext4_mb_write_##var; \
-} while (0)
-
static int ext4_mb_init_per_dev_proc(struct super_block *sb)
{
mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct proc_dir_entry *proc;
- char devname[64];
- bdevname(sb->s_bdev, devname);
- sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
-
- MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
- MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan);
- MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan);
- MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs);
- MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request);
- MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc);
+ if (sbi->s_proc == NULL)
+ return -EINVAL;
+ EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
+ EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
+ EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
+ EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
+ EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
+ EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
return 0;
err_out:
- printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname);
- remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
- remove_proc_entry(devname, proc_root_ext4);
- sbi->s_mb_proc = NULL;
-
+ remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
return -ENOMEM;
}
static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- char devname[64];
- if (sbi->s_mb_proc == NULL)
+ if (sbi->s_proc == NULL)
return -EINVAL;
- bdevname(sb->s_bdev, devname);
- remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc);
- remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc);
- remove_proc_entry(devname, proc_root_ext4);
+ remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
+ remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
return 0;
}
@@ -2702,11 +2771,6 @@ int __init init_ext4_mballoc(void)
kmem_cache_destroy(ext4_pspace_cachep);
return -ENOMEM;
}
-#ifdef CONFIG_PROC_FS
- proc_root_ext4 = proc_mkdir("fs/ext4", NULL);
- if (proc_root_ext4 == NULL)
- printk(KERN_ERR "EXT4-fs: Unable to create fs/ext4\n");
-#endif
return 0;
}
@@ -2715,9 +2779,6 @@ void exit_ext4_mballoc(void)
/* XXX: synchronize_rcu(); */
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry("fs/ext4", NULL);
-#endif
}
@@ -2727,7 +2788,7 @@ void exit_ext4_mballoc(void)
*/
static noinline_for_stack int
ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
- handle_t *handle)
+ handle_t *handle, unsigned long reserv_blks)
{
struct buffer_head *bitmap_bh = NULL;
struct ext4_super_block *es;
@@ -2747,7 +2808,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
err = -EIO;
- bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
if (!bitmap_bh)
goto out_err;
@@ -2817,6 +2878,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+ /*
+ * Now reduce the dirty block count also. Should not go negative
+ */
+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+ /* release all the reserved blocks if non delalloc */
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
+ else
+ percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+ ac->ac_b_ex.fe_len);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi,
+ ac->ac_b_ex.fe_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
if (err)
@@ -3096,6 +3174,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
struct ext4_prealloc_space *pa)
{
unsigned int len = ac->ac_o_ex.fe_len;
+
ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
&ac->ac_b_ex.fe_group,
&ac->ac_b_ex.fe_start);
@@ -3113,14 +3192,45 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
}
/*
+ * Return the prealloc space that have minimal distance
+ * from the goal block. @cpa is the prealloc
+ * space that is having currently known minimal distance
+ * from the goal block.
+ */
+static struct ext4_prealloc_space *
+ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
+ struct ext4_prealloc_space *pa,
+ struct ext4_prealloc_space *cpa)
+{
+ ext4_fsblk_t cur_distance, new_distance;
+
+ if (cpa == NULL) {
+ atomic_inc(&pa->pa_count);
+ return pa;
+ }
+ cur_distance = abs(goal_block - cpa->pa_pstart);
+ new_distance = abs(goal_block - pa->pa_pstart);
+
+ if (cur_distance < new_distance)
+ return cpa;
+
+ /* drop the previous reference */
+ atomic_dec(&cpa->pa_count);
+ atomic_inc(&pa->pa_count);
+ return pa;
+}
+
+/*
* search goal blocks in preallocated space
*/
static noinline_for_stack int
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
+ int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
- struct ext4_prealloc_space *pa;
+ struct ext4_prealloc_space *pa, *cpa = NULL;
+ ext4_fsblk_t goal_block;
/* only data can be preallocated */
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
@@ -3158,22 +3268,38 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
lg = ac->ac_lg;
if (lg == NULL)
return 0;
+ order = fls(ac->ac_o_ex.fe_len) - 1;
+ if (order > PREALLOC_TB_SIZE - 1)
+ /* The max size of hash table is PREALLOC_TB_SIZE */
+ order = PREALLOC_TB_SIZE - 1;
+
+ goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
+ ac->ac_g_ex.fe_start +
+ le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
+ /*
+ * search for the prealloc space that is having
+ * minimal distance from the goal block.
+ */
+ for (i = order; i < PREALLOC_TB_SIZE; i++) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
+ pa_inode_list) {
+ spin_lock(&pa->pa_lock);
+ if (pa->pa_deleted == 0 &&
+ pa->pa_free >= ac->ac_o_ex.fe_len) {
- rcu_read_lock();
- list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) {
- spin_lock(&pa->pa_lock);
- if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
- atomic_inc(&pa->pa_count);
- ext4_mb_use_group_pa(ac, pa);
+ cpa = ext4_mb_check_group_pa(goal_block,
+ pa, cpa);
+ }
spin_unlock(&pa->pa_lock);
- ac->ac_criteria = 20;
- rcu_read_unlock();
- return 1;
}
- spin_unlock(&pa->pa_lock);
+ rcu_read_unlock();
+ }
+ if (cpa) {
+ ext4_mb_use_group_pa(ac, cpa);
+ ac->ac_criteria = 20;
+ return 1;
}
- rcu_read_unlock();
-
return 0;
}
@@ -3396,6 +3522,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_free = pa->pa_len;
atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock);
+ INIT_LIST_HEAD(&pa->pa_inode_list);
pa->pa_deleted = 0;
pa->pa_linear = 1;
@@ -3416,10 +3543,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
- spin_lock(pa->pa_obj_lock);
- list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
- spin_unlock(pa->pa_obj_lock);
-
+ /*
+ * We will later add the new pa to the right bucket
+ * after updating the pa_free in ext4_mb_release_context
+ */
return 0;
}
@@ -3473,8 +3600,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
if (bit >= end)
break;
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
- if (next > end)
- next = end;
start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
le32_to_cpu(sbi->s_es->s_first_data_block);
mb_debug(" free preallocated %u/%u in group %u\n",
@@ -3569,22 +3694,25 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
if (list_empty(&grp->bb_prealloc_list))
return 0;
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
- /* error handling here */
- ext4_mb_release_desc(&e4b);
- BUG_ON(bitmap_bh == NULL);
+ ext4_error(sb, __func__, "Error in reading block "
+ "bitmap for %lu\n", group);
+ return 0;
}
err = ext4_mb_load_buddy(sb, group, &e4b);
- BUG_ON(err != 0); /* error handling here */
+ if (err) {
+ ext4_error(sb, __func__, "Error in loading buddy "
+ "information for %lu\n", group);
+ put_bh(bitmap_bh);
+ return 0;
+ }
if (needed == 0)
needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
- grp = ext4_get_group_info(sb, group);
INIT_LIST_HEAD(&list);
-
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
repeat:
ext4_lock_group(sb, group);
@@ -3666,7 +3794,7 @@ out:
*
* FIXME!! Make sure it is valid at all the call sites
*/
-void ext4_mb_discard_inode_preallocations(struct inode *inode)
+void ext4_discard_preallocations(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct super_block *sb = inode->i_sb;
@@ -3678,7 +3806,7 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode)
struct ext4_buddy e4b;
int err;
- if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) {
+ if (!S_ISREG(inode->i_mode)) {
/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
return;
}
@@ -3741,13 +3869,18 @@ repeat:
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
err = ext4_mb_load_buddy(sb, group, &e4b);
- BUG_ON(err != 0); /* error handling here */
+ if (err) {
+ ext4_error(sb, __func__, "Error in loading buddy "
+ "information for %lu\n", group);
+ continue;
+ }
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
- /* error handling here */
+ ext4_error(sb, __func__, "Error in reading block "
+ "bitmap for %lu\n", group);
ext4_mb_release_desc(&e4b);
- BUG_ON(bitmap_bh == NULL);
+ continue;
}
ext4_lock_group(sb, group);
@@ -3871,8 +4004,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
* per cpu locality group is to reduce the contention between block
* request from multiple CPUs.
*/
- ac->ac_lg = &sbi->s_locality_groups[get_cpu()];
- put_cpu();
+ ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id());
/* we're going to use group allocation */
ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
@@ -3950,22 +4082,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
}
+static noinline_for_stack void
+ext4_mb_discard_lg_preallocations(struct super_block *sb,
+ struct ext4_locality_group *lg,
+ int order, int total_entries)
+{
+ ext4_group_t group = 0;
+ struct ext4_buddy e4b;
+ struct list_head discard_list;
+ struct ext4_prealloc_space *pa, *tmp;
+ struct ext4_allocation_context *ac;
+
+ mb_debug("discard locality group preallocation\n");
+
+ INIT_LIST_HEAD(&discard_list);
+ ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
+
+ spin_lock(&lg->lg_prealloc_lock);
+ list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
+ pa_inode_list) {
+ spin_lock(&pa->pa_lock);
+ if (atomic_read(&pa->pa_count)) {
+ /*
+ * This is the pa that we just used
+ * for block allocation. So don't
+ * free that
+ */
+ spin_unlock(&pa->pa_lock);
+ continue;
+ }
+ if (pa->pa_deleted) {
+ spin_unlock(&pa->pa_lock);
+ continue;
+ }
+ /* only lg prealloc space */
+ BUG_ON(!pa->pa_linear);
+
+ /* seems this one can be freed ... */
+ pa->pa_deleted = 1;
+ spin_unlock(&pa->pa_lock);
+
+ list_del_rcu(&pa->pa_inode_list);
+ list_add(&pa->u.pa_tmp_list, &discard_list);
+
+ total_entries--;
+ if (total_entries <= 5) {
+ /*
+ * we want to keep only 5 entries
+ * allowing it to grow to 8. This
+ * mak sure we don't call discard
+ * soon for this list.
+ */
+ break;
+ }
+ }
+ spin_unlock(&lg->lg_prealloc_lock);
+
+ list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
+
+ ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
+ if (ext4_mb_load_buddy(sb, group, &e4b)) {
+ ext4_error(sb, __func__, "Error in loading buddy "
+ "information for %lu\n", group);
+ continue;
+ }
+ ext4_lock_group(sb, group);
+ list_del(&pa->pa_group_list);
+ ext4_mb_release_group_pa(&e4b, pa, ac);
+ ext4_unlock_group(sb, group);
+
+ ext4_mb_release_desc(&e4b);
+ list_del(&pa->u.pa_tmp_list);
+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
+ }
+ if (ac)
+ kmem_cache_free(ext4_ac_cachep, ac);
+}
+
+/*
+ * We have incremented pa_count. So it cannot be freed at this
+ * point. Also we hold lg_mutex. So no parallel allocation is
+ * possible from this lg. That means pa_free cannot be updated.
+ *
+ * A parallel ext4_mb_discard_group_preallocations is possible.
+ * which can cause the lg_prealloc_list to be updated.
+ */
+
+static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
+{
+ int order, added = 0, lg_prealloc_count = 1;
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_locality_group *lg = ac->ac_lg;
+ struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
+
+ order = fls(pa->pa_free) - 1;
+ if (order > PREALLOC_TB_SIZE - 1)
+ /* The max size of hash table is PREALLOC_TB_SIZE */
+ order = PREALLOC_TB_SIZE - 1;
+ /* Add the prealloc space to lg */
+ rcu_read_lock();
+ list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
+ pa_inode_list) {
+ spin_lock(&tmp_pa->pa_lock);
+ if (tmp_pa->pa_deleted) {
+ spin_unlock(&pa->pa_lock);
+ continue;
+ }
+ if (!added && pa->pa_free < tmp_pa->pa_free) {
+ /* Add to the tail of the previous entry */
+ list_add_tail_rcu(&pa->pa_inode_list,
+ &tmp_pa->pa_inode_list);
+ added = 1;
+ /*
+ * we want to count the total
+ * number of entries in the list
+ */
+ }
+ spin_unlock(&tmp_pa->pa_lock);
+ lg_prealloc_count++;
+ }
+ if (!added)
+ list_add_tail_rcu(&pa->pa_inode_list,
+ &lg->lg_prealloc_list[order]);
+ rcu_read_unlock();
+
+ /* Now trim the list to be not more than 8 elements */
+ if (lg_prealloc_count > 8) {
+ ext4_mb_discard_lg_preallocations(sb, lg,
+ order, lg_prealloc_count);
+ return;
+ }
+ return ;
+}
+
/*
* release all resource we used in allocation
*/
static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
- if (ac->ac_pa) {
- if (ac->ac_pa->pa_linear) {
+ struct ext4_prealloc_space *pa = ac->ac_pa;
+ if (pa) {
+ if (pa->pa_linear) {
/* see comment in ext4_mb_use_group_pa() */
- spin_lock(&ac->ac_pa->pa_lock);
- ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
- ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
- ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
- ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
- spin_unlock(&ac->ac_pa->pa_lock);
+ spin_lock(&pa->pa_lock);
+ pa->pa_pstart += ac->ac_b_ex.fe_len;
+ pa->pa_lstart += ac->ac_b_ex.fe_len;
+ pa->pa_free -= ac->ac_b_ex.fe_len;
+ pa->pa_len -= ac->ac_b_ex.fe_len;
+ spin_unlock(&pa->pa_lock);
+ /*
+ * We want to add the pa to the right bucket.
+ * Remove it from the list and while adding
+ * make sure the list to which we are adding
+ * doesn't grow big.
+ */
+ if (likely(pa->pa_free)) {
+ spin_lock(pa->pa_obj_lock);
+ list_del_rcu(&pa->pa_inode_list);
+ spin_unlock(pa->pa_obj_lock);
+ ext4_mb_add_n_trim(ac);
+ }
}
- ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
+ ext4_mb_put_pa(ac, ac->ac_sb, pa);
}
if (ac->ac_bitmap_page)
page_cache_release(ac->ac_bitmap_page);
@@ -4000,22 +4278,32 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
struct ext4_allocation_request *ar, int *errp)
{
+ int freed;
struct ext4_allocation_context *ac = NULL;
struct ext4_sb_info *sbi;
struct super_block *sb;
ext4_fsblk_t block = 0;
- int freed;
- int inquota;
+ unsigned long inquota;
+ unsigned long reserv_blks = 0;
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
- if (!test_opt(sb, MBALLOC)) {
- block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
- &(ar->len), errp);
- return block;
+ if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
+ /*
+ * With delalloc we already reserved the blocks
+ */
+ while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
+ /* let others to free the space */
+ yield();
+ ar->len = ar->len >> 1;
+ }
+ if (!ar->len) {
+ *errp = -ENOSPC;
+ return 0;
+ }
+ reserv_blks = ar->len;
}
-
while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
ar->len--;
@@ -4026,10 +4314,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
inquota = ar->len;
+ if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+ ar->flags |= EXT4_MB_DELALLOC_RESERVED;
+
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
+ ar->len = 0;
*errp = -ENOMEM;
- return 0;
+ goto out1;
}
ext4_mb_poll_new_transaction(sb, handle);
@@ -4037,12 +4329,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
*errp = ext4_mb_initialize_context(ac, ar);
if (*errp) {
ar->len = 0;
- goto out;
+ goto out2;
}
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
if (!ext4_mb_use_preallocated(ac)) {
-
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
repeat:
@@ -4058,7 +4349,7 @@ repeat:
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
- *errp = ext4_mb_mark_diskspace_used(ac, handle);
+ *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
if (*errp == -EAGAIN) {
ac->ac_b_ex.fe_group = 0;
ac->ac_b_ex.fe_start = 0;
@@ -4085,11 +4376,12 @@ repeat:
ext4_mb_release_context(ac);
-out:
+out2:
+ kmem_cache_free(ext4_ac_cachep, ac);
+out1:
if (ar->len < inquota)
DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
- kmem_cache_free(ext4_ac_cachep, ac);
return block;
}
static void ext4_mb_poll_new_transaction(struct super_block *sb,
@@ -4242,12 +4534,16 @@ do_more:
overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
}
- bitmap_bh = read_block_bitmap(sb, block_group);
- if (!bitmap_bh)
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
+ if (!bitmap_bh) {
+ err = -EIO;
goto error_return;
+ }
gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
- if (!gdp)
+ if (!gdp) {
+ err = -EIO;
goto error_return;
+ }
if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
@@ -4309,10 +4605,9 @@ do_more:
ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
} else {
ext4_lock_group(sb, block_group);
- err = mb_free_blocks(inode, &e4b, bit, count);
+ mb_free_blocks(inode, &e4b, bit, count);
ext4_mb_return_to_preallocation(inode, &e4b, block, count);
ext4_unlock_group(sb, block_group);
- BUG_ON(err != 0);
}
spin_lock(sb_bgl_lock(sbi, block_group));
@@ -4321,6 +4616,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
ext4_mb_release_desc(&e4b);
*freed += count;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index bfe6add46bc..b3b4828f8b8 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -164,11 +164,17 @@ struct ext4_free_extent {
* Locality group:
* we try to group all related changes together
* so that writeback can flush/allocate them together as well
+ * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
+ * (512). We store prealloc space into the hash based on the pa_free blocks
+ * order value.ie, fls(pa_free)-1;
*/
+#define PREALLOC_TB_SIZE 10
struct ext4_locality_group {
/* for allocator */
- struct mutex lg_mutex; /* to serialize allocates */
- struct list_head lg_prealloc_list;/* list of preallocations */
+ /* to serialize allocates */
+ struct mutex lg_mutex;
+ /* list of preallocations */
+ struct list_head lg_prealloc_list[PREALLOC_TB_SIZE];
spinlock_t lg_prealloc_lock;
};
@@ -251,7 +257,6 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac);
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-static struct proc_dir_entry *proc_root_ext4;
struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b9e077ba07e..f2a9cf498ec 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
* credit. But below we try to not accumalate too much
* of them by restarting the journal.
*/
- needed = ext4_ext_calc_credits_for_insert(inode, path);
+ needed = ext4_ext_calc_credits_for_single_extent(inode,
+ lb->last_block - lb->first_block + 1, path);
/*
* Make sure the credit we accumalated is not really high
@@ -446,8 +447,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
}
-int ext4_ext_migrate(struct inode *inode, struct file *filp,
- unsigned int cmd, unsigned long arg)
+int ext4_ext_migrate(struct inode *inode)
{
handle_t *handle;
int retval = 0, i;
@@ -515,12 +515,6 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp,
* when we add extents we extent the journal
*/
/*
- * inode_mutex prevent write and truncate on the file. Read still goes
- * through. We take i_data_sem in ext4_ext_swap_inode_data before we
- * switch the inode format to prevent read.
- */
- mutex_lock(&(inode->i_mutex));
- /*
* Even though we take i_mutex we can still cause block allocation
* via mmap write to holes. If we have allocated new blocks we fail
* migrate. New block allocation will clear EXT4_EXT_MIGRATE flag.
@@ -622,7 +616,6 @@ err_out:
tmp_inode->i_nlink = 0;
ext4_journal_stop(handle);
- mutex_unlock(&(inode->i_mutex));
if (tmp_inode)
iput(tmp_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ab16beaa830..92db9e94514 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -151,38 +151,50 @@ struct dx_map_entry
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
-static inline unsigned dx_get_hash (struct dx_entry *entry);
-static void dx_set_hash (struct dx_entry *entry, unsigned value);
-static unsigned dx_get_count (struct dx_entry *entries);
-static unsigned dx_get_limit (struct dx_entry *entries);
-static void dx_set_count (struct dx_entry *entries, unsigned value);
-static void dx_set_limit (struct dx_entry *entries, unsigned value);
-static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
-static unsigned dx_node_limit (struct inode *dir);
-static struct dx_frame *dx_probe(struct dentry *dentry,
+static inline unsigned dx_get_hash(struct dx_entry *entry);
+static void dx_set_hash(struct dx_entry *entry, unsigned value);
+static unsigned dx_get_count(struct dx_entry *entries);
+static unsigned dx_get_limit(struct dx_entry *entries);
+static void dx_set_count(struct dx_entry *entries, unsigned value);
+static void dx_set_limit(struct dx_entry *entries, unsigned value);
+static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
+static unsigned dx_node_limit(struct inode *dir);
+static struct dx_frame *dx_probe(const struct qstr *d_name,
struct inode *dir,
struct dx_hash_info *hinfo,
struct dx_frame *frame,
int *err);
-static void dx_release (struct dx_frame *frames);
-static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
- struct dx_hash_info *hinfo, struct dx_map_entry map[]);
+static void dx_release(struct dx_frame *frames);
+static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
+ struct dx_hash_info *hinfo, struct dx_map_entry map[]);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to,
+static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
struct dx_map_entry *offsets, int count);
-static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size);
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
static void dx_insert_block(struct dx_frame *frame,
u32 hash, ext4_lblk_t block);
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
struct dx_frame *frame,
struct dx_frame *frames,
__u32 *start_hash);
-static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
- struct ext4_dir_entry_2 **res_dir, int *err);
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ int *err);
static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
/*
+ * p is at least 6 bytes before the end of page
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_next_entry(struct ext4_dir_entry_2 *p)
+{
+ return (struct ext4_dir_entry_2 *)((char *)p +
+ ext4_rec_len_from_disk(p->rec_len));
+}
+
+/*
* Future: use high four bits of block for coalesce-on-delete flags
* Mask them off for now.
*/
@@ -197,59 +209,59 @@ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
entry->block = cpu_to_le32(value);
}
-static inline unsigned dx_get_hash (struct dx_entry *entry)
+static inline unsigned dx_get_hash(struct dx_entry *entry)
{
return le32_to_cpu(entry->hash);
}
-static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
+static inline void dx_set_hash(struct dx_entry *entry, unsigned value)
{
entry->hash = cpu_to_le32(value);
}
-static inline unsigned dx_get_count (struct dx_entry *entries)
+static inline unsigned dx_get_count(struct dx_entry *entries)
{
return le16_to_cpu(((struct dx_countlimit *) entries)->count);
}
-static inline unsigned dx_get_limit (struct dx_entry *entries)
+static inline unsigned dx_get_limit(struct dx_entry *entries)
{
return le16_to_cpu(((struct dx_countlimit *) entries)->limit);
}
-static inline void dx_set_count (struct dx_entry *entries, unsigned value)
+static inline void dx_set_count(struct dx_entry *entries, unsigned value)
{
((struct dx_countlimit *) entries)->count = cpu_to_le16(value);
}
-static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
+static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
{
((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
}
-static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
+static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
EXT4_DIR_REC_LEN(2) - infosize;
- return 0? 20: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
-static inline unsigned dx_node_limit (struct inode *dir)
+static inline unsigned dx_node_limit(struct inode *dir)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
- return 0? 22: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
/*
* Debug
*/
#ifdef DX_DEBUG
-static void dx_show_index (char * label, struct dx_entry *entries)
+static void dx_show_index(char * label, struct dx_entry *entries)
{
int i, n = dx_get_count (entries);
- printk("%s index ", label);
+ printk(KERN_DEBUG "%s index ", label);
for (i = 0; i < n; i++) {
- printk("%x->%lu ", i? dx_get_hash(entries + i) :
+ printk("%x->%lu ", i ? dx_get_hash(entries + i) :
0, (unsigned long)dx_get_block(entries + i));
}
printk("\n");
@@ -296,7 +308,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
struct dx_entry *entries, int levels)
{
unsigned blocksize = dir->i_sb->s_blocksize;
- unsigned count = dx_get_count (entries), names = 0, space = 0, i;
+ unsigned count = dx_get_count(entries), names = 0, space = 0, i;
unsigned bcount = 0;
struct buffer_head *bh;
int err;
@@ -315,11 +327,12 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
names += stats.names;
space += stats.space;
bcount += stats.bcount;
- brelse (bh);
+ brelse(bh);
}
if (bcount)
- printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
- names, space/bcount,(space/bcount)*100/blocksize);
+ printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
+ levels ? "" : " ", names, space/bcount,
+ (space/bcount)*100/blocksize);
return (struct stats) { names, space, bcount};
}
#endif /* DX_DEBUG */
@@ -334,7 +347,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
* back to userspace.
*/
static struct dx_frame *
-dx_probe(struct dentry *dentry, struct inode *dir,
+dx_probe(const struct qstr *d_name, struct inode *dir,
struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
{
unsigned count, indirect;
@@ -345,8 +358,6 @@ dx_probe(struct dentry *dentry, struct inode *dir,
u32 hash;
frame->bh = NULL;
- if (dentry)
- dir = dentry->d_parent->d_inode;
if (!(bh = ext4_bread (NULL,dir, 0, 0, err)))
goto fail;
root = (struct dx_root *) bh->b_data;
@@ -362,8 +373,8 @@ dx_probe(struct dentry *dentry, struct inode *dir,
}
hinfo->hash_version = root->info.hash_version;
hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
- if (dentry)
- ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
+ if (d_name)
+ ext4fs_dirhash(d_name->name, d_name->len, hinfo);
hash = hinfo->hash;
if (root->info.unused_flags & 1) {
@@ -396,7 +407,7 @@ dx_probe(struct dentry *dentry, struct inode *dir,
goto fail;
}
- dxtrace (printk("Look up %x", hash));
+ dxtrace(printk("Look up %x", hash));
while (1)
{
count = dx_get_count(entries);
@@ -545,7 +556,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
0, &err)))
return err; /* Failure */
p++;
- brelse (p->bh);
+ brelse(p->bh);
p->bh = bh;
p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
}
@@ -554,15 +565,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
/*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
-{
- return (struct ext4_dir_entry_2 *)((char *)p +
- ext4_rec_len_from_disk(p->rec_len));
-}
-
-/*
* This function fills a red-black tree with information from a
* directory block. It returns the number directory entries loaded
* into the tree. If there is an error it is returned in err.
@@ -592,7 +594,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
/* On error, skip the f_pos to the next block. */
dir_file->f_pos = (dir_file->f_pos |
(dir->i_sb->s_blocksize - 1)) + 1;
- brelse (bh);
+ brelse(bh);
return count;
}
ext4fs_dirhash(de->name, de->name_len, hinfo);
@@ -634,8 +636,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
int ret, err;
__u32 hashval;
- dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
- start_minor_hash));
+ dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
+ start_hash, start_minor_hash));
dir = dir_file->f_path.dentry->d_inode;
if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -647,7 +649,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
}
hinfo.hash = start_hash;
hinfo.minor_hash = 0;
- frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err);
+ frame = dx_probe(NULL, dir, &hinfo, frames, &err);
if (!frame)
return err;
@@ -693,8 +695,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
break;
}
dx_release(frames);
- dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n",
- count, *next_hash));
+ dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
+ "next hash: %x\n", count, *next_hash));
return count;
errout:
dx_release(frames);
@@ -801,17 +803,17 @@ static inline int ext4_match (int len, const char * const name,
/*
* Returns 0 if not found, -1 on failure, and 1 on success
*/
-static inline int search_dirblock(struct buffer_head * bh,
+static inline int search_dirblock(struct buffer_head *bh,
struct inode *dir,
- struct dentry *dentry,
+ const struct qstr *d_name,
unsigned long offset,
struct ext4_dir_entry_2 ** res_dir)
{
struct ext4_dir_entry_2 * de;
char * dlimit;
int de_len;
- const char *name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
+ const char *name = d_name->name;
+ int namelen = d_name->len;
de = (struct ext4_dir_entry_2 *) bh->b_data;
dlimit = bh->b_data + dir->i_sb->s_blocksize;
@@ -850,12 +852,13 @@ static inline int search_dirblock(struct buffer_head * bh,
* The returned buffer_head has ->b_count elevated. The caller is expected
* to brelse() it when appropriate.
*/
-static struct buffer_head * ext4_find_entry (struct dentry *dentry,
+static struct buffer_head * ext4_find_entry (struct inode *dir,
+ const struct qstr *d_name,
struct ext4_dir_entry_2 ** res_dir)
{
- struct super_block * sb;
- struct buffer_head * bh_use[NAMEI_RA_SIZE];
- struct buffer_head * bh, *ret = NULL;
+ struct super_block *sb;
+ struct buffer_head *bh_use[NAMEI_RA_SIZE];
+ struct buffer_head *bh, *ret = NULL;
ext4_lblk_t start, block, b;
int ra_max = 0; /* Number of bh's in the readahead
buffer, bh_use[] */
@@ -864,16 +867,15 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
int num = 0;
ext4_lblk_t nblocks;
int i, err;
- struct inode *dir = dentry->d_parent->d_inode;
int namelen;
*res_dir = NULL;
sb = dir->i_sb;
- namelen = dentry->d_name.len;
+ namelen = d_name->len;
if (namelen > EXT4_NAME_LEN)
return NULL;
if (is_dx(dir)) {
- bh = ext4_dx_find_entry(dentry, res_dir, &err);
+ bh = ext4_dx_find_entry(dir, d_name, res_dir, &err);
/*
* On success, or if the error was file not found,
* return. Otherwise, fall back to doing a search the
@@ -881,7 +883,8 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry,
*/
if (bh || (err != ERR_BAD_DX_DIR))
return bh;
- dxtrace(printk("ext4_find_entry: dx failed, falling back\n"));
+ dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
+ "falling back\n"));
}
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
start = EXT4_I(dir)->i_dir_start_lookup;
@@ -925,7 +928,7 @@ restart:
brelse(bh);
goto next;
}
- i = search_dirblock(bh, dir, dentry,
+ i = search_dirblock(bh, dir, d_name,
block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
if (i == 1) {
EXT4_I(dir)->i_dir_start_lookup = block;
@@ -955,11 +958,11 @@ restart:
cleanup_and_exit:
/* Clean up the read-ahead blocks */
for (; ra_ptr < ra_max; ra_ptr++)
- brelse (bh_use[ra_ptr]);
+ brelse(bh_use[ra_ptr]);
return ret;
}
-static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
+static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name,
struct ext4_dir_entry_2 **res_dir, int *err)
{
struct super_block * sb;
@@ -970,14 +973,13 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
struct buffer_head *bh;
ext4_lblk_t block;
int retval;
- int namelen = dentry->d_name.len;
- const u8 *name = dentry->d_name.name;
- struct inode *dir = dentry->d_parent->d_inode;
+ int namelen = d_name->len;
+ const u8 *name = d_name->name;
sb = dir->i_sb;
/* NFS may look up ".." - look at dx_root directory block */
if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){
- if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err)))
+ if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err)))
return NULL;
} else {
frame = frames;
@@ -993,21 +995,23 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
de = (struct ext4_dir_entry_2 *) bh->b_data;
top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de))
- if (ext4_match (namelen, name, de)) {
- if (!ext4_check_dir_entry("ext4_find_entry",
- dir, de, bh,
- (block<<EXT4_BLOCK_SIZE_BITS(sb))
- +((char *)de - bh->b_data))) {
- brelse (bh);
+ for (; de < top; de = ext4_next_entry(de)) {
+ int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
+ + ((char *) de - bh->b_data);
+
+ if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
+ brelse(bh);
*err = ERR_BAD_DX_DIR;
goto errout;
}
- *res_dir = de;
- dx_release (frames);
- return bh;
+
+ if (ext4_match(namelen, name, de)) {
+ *res_dir = de;
+ dx_release(frames);
+ return bh;
+ }
}
- brelse (bh);
+ brelse(bh);
/* Check to see if we should continue to search */
retval = ext4_htree_next_block(dir, hash, frame,
frames, NULL);
@@ -1022,25 +1026,25 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
*err = -ENOENT;
errout:
- dxtrace(printk("%s not found\n", name));
+ dxtrace(printk(KERN_DEBUG "%s not found\n", name));
dx_release (frames);
return NULL;
}
-static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
- struct inode * inode;
- struct ext4_dir_entry_2 * de;
- struct buffer_head * bh;
+ struct inode *inode;
+ struct ext4_dir_entry_2 *de;
+ struct buffer_head *bh;
if (dentry->d_name.len > EXT4_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- bh = ext4_find_entry(dentry, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de);
inode = NULL;
if (bh) {
unsigned long ino = le32_to_cpu(de->inode);
- brelse (bh);
+ brelse(bh);
if (!ext4_valid_inum(dir->i_sb, ino)) {
ext4_error(dir->i_sb, "ext4_lookup",
"bad inode number: %lu", ino);
@@ -1059,15 +1063,14 @@ struct dentry *ext4_get_parent(struct dentry *child)
unsigned long ino;
struct dentry *parent;
struct inode *inode;
- struct dentry dotdot;
+ static const struct qstr dotdot = {
+ .name = "..",
+ .len = 2,
+ };
struct ext4_dir_entry_2 * de;
struct buffer_head *bh;
- dotdot.d_name.name = "..";
- dotdot.d_name.len = 2;
- dotdot.d_parent = child; /* confusing, isn't it! */
-
- bh = ext4_find_entry(&dotdot, &de);
+ bh = ext4_find_entry(child->d_inode, &dotdot, &de);
inode = NULL;
if (!bh)
return ERR_PTR(-ENOENT);
@@ -1198,10 +1201,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
/* create map in the end of data2 block */
map = (struct dx_map_entry *) (data2 + blocksize);
- count = dx_make_map ((struct ext4_dir_entry_2 *) data1,
+ count = dx_make_map((struct ext4_dir_entry_2 *) data1,
blocksize, hinfo, map);
map -= count;
- dx_sort_map (map, count);
+ dx_sort_map(map, count);
/* Split the existing block in the middle, size-wise */
size = 0;
move = 0;
@@ -1222,7 +1225,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
/* Fancy dance to stay within two buffers */
de2 = dx_move_dirents(data1, data2, map + split, count - split);
- de = dx_pack_dirents(data1,blocksize);
+ de = dx_pack_dirents(data1, blocksize);
de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
@@ -1234,15 +1237,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
swap(*bh, bh2);
de = de2;
}
- dx_insert_block (frame, hash2 + continued, newblock);
- err = ext4_journal_dirty_metadata (handle, bh2);
+ dx_insert_block(frame, hash2 + continued, newblock);
+ err = ext4_journal_dirty_metadata(handle, bh2);
if (err)
goto journal_error;
- err = ext4_journal_dirty_metadata (handle, frame->bh);
+ err = ext4_journal_dirty_metadata(handle, frame->bh);
if (err)
goto journal_error;
- brelse (bh2);
- dxtrace(dx_show_index ("frame", frame->entries));
+ brelse(bh2);
+ dxtrace(dx_show_index("frame", frame->entries));
return de;
journal_error:
@@ -1268,7 +1271,7 @@ errout:
*/
static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
struct inode *inode, struct ext4_dir_entry_2 *de,
- struct buffer_head * bh)
+ struct buffer_head *bh)
{
struct inode *dir = dentry->d_parent->d_inode;
const char *name = dentry->d_name.name;
@@ -1285,11 +1288,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
while ((char *) de <= top) {
if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
bh, offset)) {
- brelse (bh);
+ brelse(bh);
return -EIO;
}
- if (ext4_match (namelen, name, de)) {
- brelse (bh);
+ if (ext4_match(namelen, name, de)) {
+ brelse(bh);
return -EEXIST;
}
nlen = EXT4_DIR_REC_LEN(de->name_len);
@@ -1326,7 +1329,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
} else
de->inode = 0;
de->name_len = namelen;
- memcpy (de->name, name, namelen);
+ memcpy(de->name, name, namelen);
/*
* XXX shouldn't update any times until successful
* completion of syscall, but too many callers depend
@@ -1374,7 +1377,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
struct fake_dirent *fde;
blocksize = dir->i_sb->s_blocksize;
- dxtrace(printk("Creating index\n"));
+ dxtrace(printk(KERN_DEBUG "Creating index\n"));
retval = ext4_journal_get_write_access(handle, bh);
if (retval) {
ext4_std_error(dir->i_sb, retval);
@@ -1383,7 +1386,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
}
root = (struct dx_root *) bh->b_data;
- bh2 = ext4_append (handle, dir, &block, &retval);
+ bh2 = ext4_append(handle, dir, &block, &retval);
if (!(bh2)) {
brelse(bh);
return retval;
@@ -1409,9 +1412,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
root->info.info_length = sizeof(root->info);
root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
entries = root->entries;
- dx_set_block (entries, 1);
- dx_set_count (entries, 1);
- dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
+ dx_set_block(entries, 1);
+ dx_set_count(entries, 1);
+ dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
/* Initialize as for dx_probe */
hinfo.hash_version = root->info.hash_version;
@@ -1440,14 +1443,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
* may not sleep between calling this and putting something into
* the entry, as someone else might have used it while you slept.
*/
-static int ext4_add_entry (handle_t *handle, struct dentry *dentry,
- struct inode *inode)
+static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
{
struct inode *dir = dentry->d_parent->d_inode;
unsigned long offset;
- struct buffer_head * bh;
+ struct buffer_head *bh;
struct ext4_dir_entry_2 *de;
- struct super_block * sb;
+ struct super_block *sb;
int retval;
int dx_fallback=0;
unsigned blocksize;
@@ -1497,13 +1500,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct dx_frame frames[2], *frame;
struct dx_entry *entries, *at;
struct dx_hash_info hinfo;
- struct buffer_head * bh;
+ struct buffer_head *bh;
struct inode *dir = dentry->d_parent->d_inode;
- struct super_block * sb = dir->i_sb;
+ struct super_block *sb = dir->i_sb;
struct ext4_dir_entry_2 *de;
int err;
- frame = dx_probe(dentry, NULL, &hinfo, frames, &err);
+ frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err);
if (!frame)
return err;
entries = frame->entries;
@@ -1524,7 +1527,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
}
/* Block full, should compress but for now just split */
- dxtrace(printk("using %u of %u node entries\n",
+ dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
dx_get_count(entries), dx_get_limit(entries)));
/* Need to split index? */
if (dx_get_count(entries) == dx_get_limit(entries)) {
@@ -1556,7 +1559,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (levels) {
unsigned icount1 = icount/2, icount2 = icount - icount1;
unsigned hash2 = dx_get_hash(entries + icount1);
- dxtrace(printk("Split index %i/%i\n", icount1, icount2));
+ dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+ icount1, icount2));
BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
err = ext4_journal_get_write_access(handle,
@@ -1564,11 +1568,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
if (err)
goto journal_error;
- memcpy ((char *) entries2, (char *) (entries + icount1),
- icount2 * sizeof(struct dx_entry));
- dx_set_count (entries, icount1);
- dx_set_count (entries2, icount2);
- dx_set_limit (entries2, dx_node_limit(dir));
+ memcpy((char *) entries2, (char *) (entries + icount1),
+ icount2 * sizeof(struct dx_entry));
+ dx_set_count(entries, icount1);
+ dx_set_count(entries2, icount2);
+ dx_set_limit(entries2, dx_node_limit(dir));
/* Which index block gets the new entry? */
if (at - entries >= icount1) {
@@ -1576,16 +1580,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
frame->entries = entries = entries2;
swap(frame->bh, bh2);
}
- dx_insert_block (frames + 0, hash2, newblock);
- dxtrace(dx_show_index ("node", frames[1].entries));
- dxtrace(dx_show_index ("node",
+ dx_insert_block(frames + 0, hash2, newblock);
+ dxtrace(dx_show_index("node", frames[1].entries));
+ dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
err = ext4_journal_dirty_metadata(handle, bh2);
if (err)
goto journal_error;
brelse (bh2);
} else {
- dxtrace(printk("Creating second level index...\n"));
+ dxtrace(printk(KERN_DEBUG
+ "Creating second level index...\n"));
memcpy((char *) entries2, (char *) entries,
icount * sizeof(struct dx_entry));
dx_set_limit(entries2, dx_node_limit(dir));
@@ -1627,12 +1632,12 @@ cleanup:
* ext4_delete_entry deletes a directory entry by merging it with the
* previous entry
*/
-static int ext4_delete_entry (handle_t *handle,
- struct inode * dir,
- struct ext4_dir_entry_2 * de_del,
- struct buffer_head * bh)
+static int ext4_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh)
{
- struct ext4_dir_entry_2 * de, * pde;
+ struct ext4_dir_entry_2 *de, *pde;
int i;
i = 0;
@@ -1713,11 +1718,11 @@ static int ext4_add_nondir(handle_t *handle,
* If the create succeeds, we fill in the inode information
* with d_instantiate().
*/
-static int ext4_create (struct inode * dir, struct dentry * dentry, int mode,
- struct nameidata *nd)
+static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
{
handle_t *handle;
- struct inode * inode;
+ struct inode *inode;
int err, retries = 0;
retry:
@@ -1744,8 +1749,8 @@ retry:
return err;
}
-static int ext4_mknod (struct inode * dir, struct dentry *dentry,
- int mode, dev_t rdev)
+static int ext4_mknod(struct inode *dir, struct dentry *dentry,
+ int mode, dev_t rdev)
{
handle_t *handle;
struct inode *inode;
@@ -1764,11 +1769,11 @@ retry:
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
- inode = ext4_new_inode (handle, dir, mode);
+ inode = ext4_new_inode(handle, dir, mode);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
init_special_inode(inode, inode->i_mode, rdev);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
inode->i_op = &ext4_special_inode_operations;
#endif
err = ext4_add_nondir(handle, dentry, inode);
@@ -1779,12 +1784,12 @@ retry:
return err;
}
-static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
handle_t *handle;
- struct inode * inode;
- struct buffer_head * dir_block;
- struct ext4_dir_entry_2 * de;
+ struct inode *inode;
+ struct buffer_head *dir_block;
+ struct ext4_dir_entry_2 *de;
int err, retries = 0;
if (EXT4_DIR_LINK_MAX(dir))
@@ -1800,7 +1805,7 @@ retry:
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
- inode = ext4_new_inode (handle, dir, S_IFDIR | mode);
+ inode = ext4_new_inode(handle, dir, S_IFDIR | mode);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
@@ -1808,7 +1813,7 @@ retry:
inode->i_op = &ext4_dir_inode_operations;
inode->i_fop = &ext4_dir_operations;
inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- dir_block = ext4_bread (handle, inode, 0, 1, &err);
+ dir_block = ext4_bread(handle, inode, 0, 1, &err);
if (!dir_block)
goto out_clear_inode;
BUFFER_TRACE(dir_block, "get_write_access");
@@ -1817,26 +1822,26 @@ retry:
de->inode = cpu_to_le32(inode->i_ino);
de->name_len = 1;
de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
- strcpy (de->name, ".");
+ strcpy(de->name, ".");
ext4_set_de_type(dir->i_sb, de, S_IFDIR);
de = ext4_next_entry(de);
de->inode = cpu_to_le32(dir->i_ino);
de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
EXT4_DIR_REC_LEN(1));
de->name_len = 2;
- strcpy (de->name, "..");
+ strcpy(de->name, "..");
ext4_set_de_type(dir->i_sb, de, S_IFDIR);
inode->i_nlink = 2;
BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata");
ext4_journal_dirty_metadata(handle, dir_block);
- brelse (dir_block);
+ brelse(dir_block);
ext4_mark_inode_dirty(handle, inode);
- err = ext4_add_entry (handle, dentry, inode);
+ err = ext4_add_entry(handle, dentry, inode);
if (err) {
out_clear_inode:
clear_nlink(inode);
ext4_mark_inode_dirty(handle, inode);
- iput (inode);
+ iput(inode);
goto out_stop;
}
ext4_inc_count(handle, dir);
@@ -1853,17 +1858,17 @@ out_stop:
/*
* routine to check that the specified directory is empty (for rmdir)
*/
-static int empty_dir (struct inode * inode)
+static int empty_dir(struct inode *inode)
{
unsigned long offset;
- struct buffer_head * bh;
- struct ext4_dir_entry_2 * de, * de1;
- struct super_block * sb;
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de, *de1;
+ struct super_block *sb;
int err = 0;
sb = inode->i_sb;
if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
- !(bh = ext4_bread (NULL, inode, 0, 0, &err))) {
+ !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
if (err)
ext4_error(inode->i_sb, __func__,
"error %d reading directory #%lu offset 0",
@@ -1878,23 +1883,23 @@ static int empty_dir (struct inode * inode)
de1 = ext4_next_entry(de);
if (le32_to_cpu(de->inode) != inode->i_ino ||
!le32_to_cpu(de1->inode) ||
- strcmp (".", de->name) ||
- strcmp ("..", de1->name)) {
- ext4_warning (inode->i_sb, "empty_dir",
- "bad directory (dir #%lu) - no `.' or `..'",
- inode->i_ino);
- brelse (bh);
+ strcmp(".", de->name) ||
+ strcmp("..", de1->name)) {
+ ext4_warning(inode->i_sb, "empty_dir",
+ "bad directory (dir #%lu) - no `.' or `..'",
+ inode->i_ino);
+ brelse(bh);
return 1;
}
offset = ext4_rec_len_from_disk(de->rec_len) +
ext4_rec_len_from_disk(de1->rec_len);
de = ext4_next_entry(de1);
- while (offset < inode->i_size ) {
+ while (offset < inode->i_size) {
if (!bh ||
(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
err = 0;
- brelse (bh);
- bh = ext4_bread (NULL, inode,
+ brelse(bh);
+ bh = ext4_bread(NULL, inode,
offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
if (!bh) {
if (err)
@@ -1914,13 +1919,13 @@ static int empty_dir (struct inode * inode)
continue;
}
if (le32_to_cpu(de->inode)) {
- brelse (bh);
+ brelse(bh);
return 0;
}
offset += ext4_rec_len_from_disk(de->rec_len);
de = ext4_next_entry(de);
}
- brelse (bh);
+ brelse(bh);
return 1;
}
@@ -1951,8 +1956,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
* ->i_nlink. For, say it, character device. Not a regular file,
* not a directory, not a symlink and ->i_nlink > 0.
*/
- J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+ J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
@@ -2066,12 +2071,12 @@ out_brelse:
goto out_err;
}
-static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
+static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
{
int retval;
- struct inode * inode;
- struct buffer_head * bh;
- struct ext4_dir_entry_2 * de;
+ struct inode *inode;
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de;
handle_t *handle;
/* Initialize quotas before so that eventual writes go in
@@ -2082,7 +2087,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
return PTR_ERR(handle);
retval = -ENOENT;
- bh = ext4_find_entry (dentry, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de);
if (!bh)
goto end_rmdir;
@@ -2096,16 +2101,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
goto end_rmdir;
retval = -ENOTEMPTY;
- if (!empty_dir (inode))
+ if (!empty_dir(inode))
goto end_rmdir;
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_rmdir;
if (!EXT4_DIR_LINK_EMPTY(inode))
- ext4_warning (inode->i_sb, "ext4_rmdir",
- "empty directory has too many links (%d)",
- inode->i_nlink);
+ ext4_warning(inode->i_sb, "ext4_rmdir",
+ "empty directory has too many links (%d)",
+ inode->i_nlink);
inode->i_version++;
clear_nlink(inode);
/* There's no need to set i_disksize: the fact that i_nlink is
@@ -2121,16 +2126,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry)
end_rmdir:
ext4_journal_stop(handle);
- brelse (bh);
+ brelse(bh);
return retval;
}
-static int ext4_unlink(struct inode * dir, struct dentry *dentry)
+static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
int retval;
- struct inode * inode;
- struct buffer_head * bh;
- struct ext4_dir_entry_2 * de;
+ struct inode *inode;
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de;
handle_t *handle;
/* Initialize quotas before so that eventual writes go
@@ -2144,7 +2149,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
handle->h_sync = 1;
retval = -ENOENT;
- bh = ext4_find_entry (dentry, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de);
if (!bh)
goto end_unlink;
@@ -2155,9 +2160,9 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
goto end_unlink;
if (!inode->i_nlink) {
- ext4_warning (inode->i_sb, "ext4_unlink",
- "Deleting nonexistent file (%lu), %d",
- inode->i_ino, inode->i_nlink);
+ ext4_warning(inode->i_sb, "ext4_unlink",
+ "Deleting nonexistent file (%lu), %d",
+ inode->i_ino, inode->i_nlink);
inode->i_nlink = 1;
}
retval = ext4_delete_entry(handle, dir, de, bh);
@@ -2175,15 +2180,15 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry)
end_unlink:
ext4_journal_stop(handle);
- brelse (bh);
+ brelse(bh);
return retval;
}
-static int ext4_symlink (struct inode * dir,
- struct dentry *dentry, const char * symname)
+static int ext4_symlink(struct inode *dir,
+ struct dentry *dentry, const char *symname)
{
handle_t *handle;
- struct inode * inode;
+ struct inode *inode;
int l, err, retries = 0;
l = strlen(symname)+1;
@@ -2200,12 +2205,12 @@ retry:
if (IS_DIRSYNC(dir))
handle->h_sync = 1;
- inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+ inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_stop;
- if (l > sizeof (EXT4_I(inode)->i_data)) {
+ if (l > sizeof(EXT4_I(inode)->i_data)) {
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
/*
@@ -2218,14 +2223,14 @@ retry:
if (err) {
clear_nlink(inode);
ext4_mark_inode_dirty(handle, inode);
- iput (inode);
+ iput(inode);
goto out_stop;
}
} else {
/* clear the extent format for fast symlink */
EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
inode->i_op = &ext4_fast_symlink_inode_operations;
- memcpy((char*)&EXT4_I(inode)->i_data,symname,l);
+ memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
inode->i_size = l-1;
}
EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2237,8 +2242,8 @@ out_stop:
return err;
}
-static int ext4_link (struct dentry * old_dentry,
- struct inode * dir, struct dentry *dentry)
+static int ext4_link(struct dentry *old_dentry,
+ struct inode *dir, struct dentry *dentry)
{
handle_t *handle;
struct inode *inode = old_dentry->d_inode;
@@ -2281,13 +2286,13 @@ retry:
* Anybody can rename anything with this: the permission checks are left to the
* higher-level routines.
*/
-static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
- struct inode * new_dir,struct dentry *new_dentry)
+static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
{
handle_t *handle;
- struct inode * old_inode, * new_inode;
- struct buffer_head * old_bh, * new_bh, * dir_bh;
- struct ext4_dir_entry_2 * old_de, * new_de;
+ struct inode *old_inode, *new_inode;
+ struct buffer_head *old_bh, *new_bh, *dir_bh;
+ struct ext4_dir_entry_2 *old_de, *new_de;
int retval;
old_bh = new_bh = dir_bh = NULL;
@@ -2305,7 +2310,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
handle->h_sync = 1;
- old_bh = ext4_find_entry (old_dentry, &old_de);
+ old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
@@ -2318,32 +2323,32 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
goto end_rename;
new_inode = new_dentry->d_inode;
- new_bh = ext4_find_entry (new_dentry, &new_de);
+ new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
if (new_bh) {
if (!new_inode) {
- brelse (new_bh);
+ brelse(new_bh);
new_bh = NULL;
}
}
if (S_ISDIR(old_inode->i_mode)) {
if (new_inode) {
retval = -ENOTEMPTY;
- if (!empty_dir (new_inode))
+ if (!empty_dir(new_inode))
goto end_rename;
}
retval = -EIO;
- dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval);
+ dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
if (!dir_bh)
goto end_rename;
if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
goto end_rename;
retval = -EMLINK;
- if (!new_inode && new_dir!=old_dir &&
+ if (!new_inode && new_dir != old_dir &&
new_dir->i_nlink >= EXT4_LINK_MAX)
goto end_rename;
}
if (!new_bh) {
- retval = ext4_add_entry (handle, new_dentry, old_inode);
+ retval = ext4_add_entry(handle, new_dentry, old_inode);
if (retval)
goto end_rename;
} else {
@@ -2385,7 +2390,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
struct buffer_head *old_bh2;
struct ext4_dir_entry_2 *old_de2;
- old_bh2 = ext4_find_entry(old_dentry, &old_de2);
+ old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
if (old_bh2) {
retval = ext4_delete_entry(handle, old_dir,
old_de2, old_bh2);
@@ -2430,9 +2435,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry,
retval = 0;
end_rename:
- brelse (dir_bh);
- brelse (old_bh);
- brelse (new_bh);
+ brelse(dir_bh);
+ brelse(old_bh);
+ brelse(new_bh);
ext4_journal_stop(handle);
return retval;
}
@@ -2451,7 +2456,7 @@ const struct inode_operations ext4_dir_inode_operations = {
.mknod = ext4_mknod,
.rename = ext4_rename,
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
@@ -2462,7 +2467,7 @@ const struct inode_operations ext4_dir_inode_operations = {
const struct inode_operations ext4_special_inode_operations = {
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9ff7b1c0423..b6ec1843a01 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -73,7 +73,7 @@ static int verify_group_input(struct super_block *sb,
"Inode bitmap not in group (block %llu)",
(unsigned long long)input->inode_bitmap);
else if (outside(input->inode_table, start, end) ||
- outside(itend - 1, start, end))
+ outside(itend - 1, start, end))
ext4_warning(sb, __func__,
"Inode table not in group (blocks %llu-%llu)",
(unsigned long long)input->inode_table, itend - 1);
@@ -104,7 +104,7 @@ static int verify_group_input(struct super_block *sb,
(unsigned long long)input->inode_bitmap,
start, metaend - 1);
else if (inside(input->inode_table, start, metaend) ||
- inside(itend - 1, start, metaend))
+ inside(itend - 1, start, metaend))
ext4_warning(sb, __func__,
"Inode table (%llu-%llu) overlaps"
"GDT table (%llu-%llu)",
@@ -158,9 +158,9 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
if (err) {
if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
return err;
- if ((err = ext4_journal_get_write_access(handle, bh)))
+ if ((err = ext4_journal_get_write_access(handle, bh)))
return err;
- }
+ }
return 0;
}
@@ -418,9 +418,9 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
/*
* If we are not using the primary superblock/GDT copy don't resize,
- * because the user tools have no way of handling this. Probably a
- * bad time to do it anyways.
- */
+ * because the user tools have no way of handling this. Probably a
+ * bad time to do it anyways.
+ */
if (EXT4_SB(sb)->s_sbh->b_blocknr !=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
ext4_warning(sb, __func__,
@@ -507,14 +507,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
return 0;
exit_inode:
- //ext4_journal_release_buffer(handle, iloc.bh);
+ /* ext4_journal_release_buffer(handle, iloc.bh); */
brelse(iloc.bh);
exit_dindj:
- //ext4_journal_release_buffer(handle, dind);
+ /* ext4_journal_release_buffer(handle, dind); */
exit_primary:
- //ext4_journal_release_buffer(handle, *primary);
+ /* ext4_journal_release_buffer(handle, *primary); */
exit_sbh:
- //ext4_journal_release_buffer(handle, *primary);
+ /* ext4_journal_release_buffer(handle, *primary); */
exit_dind:
brelse(dind);
exit_bh:
@@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
if (reserved_gdb || gdb_off == 0) {
if (!EXT4_HAS_COMPAT_FEATURE(sb,
- EXT4_FEATURE_COMPAT_RESIZE_INODE)){
+ EXT4_FEATURE_COMPAT_RESIZE_INODE)
+ || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
ext4_warning(sb, __func__,
"No reserved GDT blocks, can't resize");
return -EPERM;
@@ -818,12 +819,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
goto exit_journal;
- /*
- * We will only either add reserved group blocks to a backup group
- * or remove reserved blocks for the first group in a new group block.
- * Doing both would be mean more complex code, and sane people don't
- * use non-sparse filesystems anymore. This is already checked above.
- */
+ /*
+ * We will only either add reserved group blocks to a backup group
+ * or remove reserved blocks for the first group in a new group block.
+ * Doing both would be mean more complex code, and sane people don't
+ * use non-sparse filesystems anymore. This is already checked above.
+ */
if (gdb_off) {
primary = sbi->s_group_desc[gdb_num];
if ((err = ext4_journal_get_write_access(handle, primary)))
@@ -835,24 +836,24 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
} else if ((err = add_new_gdb(handle, inode, input, &primary)))
goto exit_journal;
- /*
- * OK, now we've set up the new group. Time to make it active.
- *
- * Current kernels don't lock all allocations via lock_super(),
- * so we have to be safe wrt. concurrent accesses the group
- * data. So we need to be careful to set all of the relevant
- * group descriptor data etc. *before* we enable the group.
- *
- * The key field here is sbi->s_groups_count: as long as
- * that retains its old value, nobody is going to access the new
- * group.
- *
- * So first we update all the descriptor metadata for the new
- * group; then we update the total disk blocks count; then we
- * update the groups count to enable the group; then finally we
- * update the free space counts so that the system can start
- * using the new disk blocks.
- */
+ /*
+ * OK, now we've set up the new group. Time to make it active.
+ *
+ * Current kernels don't lock all allocations via lock_super(),
+ * so we have to be safe wrt. concurrent accesses the group
+ * data. So we need to be careful to set all of the relevant
+ * group descriptor data etc. *before* we enable the group.
+ *
+ * The key field here is sbi->s_groups_count: as long as
+ * that retains its old value, nobody is going to access the new
+ * group.
+ *
+ * So first we update all the descriptor metadata for the new
+ * group; then we update the total disk blocks count; then we
+ * update the groups count to enable the group; then finally we
+ * update the free space counts so that the system can start
+ * using the new disk blocks.
+ */
/* Update group descriptor block for new group */
gdp = (struct ext4_group_desc *)((char *)primary->b_data +
@@ -866,6 +867,14 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
/*
+ * We can allocate memory for mb_alloc based on the new group
+ * descriptor
+ */
+ err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+ if (err)
+ goto exit_journal;
+
+ /*
* Make the new blocks and inodes valid next. We do this before
* increasing the group count so that once the group is enabled,
* all of its blocks and inodes are already valid.
@@ -919,6 +928,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
percpu_counter_add(&sbi->s_freeinodes_counter,
EXT4_INODES_PER_GROUP(sb));
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ ext4_group_t flex_group;
+ flex_group = ext4_flex_group(sbi, input->group);
+ sbi->s_flex_groups[flex_group].free_blocks +=
+ input->free_blocks_count;
+ sbi->s_flex_groups[flex_group].free_inodes +=
+ EXT4_INODES_PER_GROUP(sb);
+ }
+
ext4_journal_dirty_metadata(handle, sbi->s_sbh);
sb->s_dirt = 1;
@@ -937,7 +955,8 @@ exit_put:
return err;
} /* ext4_group_add */
-/* Extend the filesystem to the new number of blocks specified. This entry
+/*
+ * Extend the filesystem to the new number of blocks specified. This entry
* point is only used to extend the current filesystem to the end of the last
* existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
* for emergencies (because it has no dependencies on reserved blocks).
@@ -953,10 +972,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
ext4_group_t o_groups_count;
ext4_grpblk_t last;
ext4_grpblk_t add;
- struct buffer_head * bh;
+ struct buffer_head *bh;
handle_t *handle;
int err;
unsigned long freed_blocks;
+ ext4_group_t group;
+ struct ext4_group_info *grp;
/* We don't need to worry about locking wrt other resizers just
* yet: we're going to revalidate es->s_blocks_count after
@@ -988,7 +1009,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
}
/* Handle the remaining blocks in the last group only. */
- ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
if (last == 0) {
ext4_warning(sb, __func__,
@@ -1013,7 +1034,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
o_blocks_count + add, add);
/* See if the device is actually as big as what was requested */
- bh = sb_bread(sb, o_blocks_count + add -1);
+ bh = sb_bread(sb, o_blocks_count + add - 1);
if (!bh) {
ext4_warning(sb, __func__,
"can't read last block, resize aborted");
@@ -1060,6 +1081,52 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
o_blocks_count + add);
if ((err = ext4_journal_stop(handle)))
goto exit_put;
+
+ /*
+ * Mark mballoc pages as not up to date so that they will be updated
+ * next time they are loaded by ext4_mb_load_buddy.
+ *
+ * XXX Bad, Bad, BAD!!! We should not be overloading the
+ * Uptodate flag, particularly on thte bitmap bh, as way of
+ * hinting to ext4_mb_load_buddy() that it needs to be
+ * overloaded. A user could take a LVM snapshot, then do an
+ * on-line fsck, and clear the uptodate flag, and this would
+ * not be a bug in userspace, but a bug in the kernel. FIXME!!!
+ */
+ {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Get the info on the last group */
+ grp = ext4_get_group_info(sb, group);
+
+ /* Update free blocks in group info */
+ ext4_mb_update_group_info(grp, add);
+ }
+
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index cb96f127c36..dea8f13c2fd 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -34,6 +34,8 @@
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/marker.h>
#include <linux/log2.h>
#include <linux/crc16.h>
#include <asm/uaccess.h>
@@ -45,24 +47,25 @@
#include "namei.h"
#include "group.h"
+struct proc_dir_entry *ext4_proc_root;
+
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
unsigned int);
-static void ext4_commit_super (struct super_block * sb,
- struct ext4_super_block * es,
- int sync);
-static void ext4_mark_recovery_complete(struct super_block * sb,
- struct ext4_super_block * es);
-static void ext4_clear_journal_err(struct super_block * sb,
- struct ext4_super_block * es);
+static void ext4_commit_super(struct super_block *sb,
+ struct ext4_super_block *es, int sync);
+static void ext4_mark_recovery_complete(struct super_block *sb,
+ struct ext4_super_block *es);
+static void ext4_clear_journal_err(struct super_block *sb,
+ struct ext4_super_block *es);
static int ext4_sync_fs(struct super_block *sb, int wait);
-static const char *ext4_decode_error(struct super_block * sb, int errno,
+static const char *ext4_decode_error(struct super_block *sb, int errno,
char nbuf[16]);
-static int ext4_remount (struct super_block * sb, int * flags, char * data);
-static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf);
+static int ext4_remount(struct super_block *sb, int *flags, char *data);
+static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
static void ext4_unlockfs(struct super_block *sb);
-static void ext4_write_super (struct super_block * sb);
+static void ext4_write_super(struct super_block *sb);
static void ext4_write_super_lockfs(struct super_block *sb);
@@ -211,15 +214,15 @@ static void ext4_handle_error(struct super_block *sb)
if (sb->s_flags & MS_RDONLY)
return;
- if (!test_opt (sb, ERRORS_CONT)) {
+ if (!test_opt(sb, ERRORS_CONT)) {
journal_t *journal = EXT4_SB(sb)->s_journal;
EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
if (journal)
jbd2_journal_abort(journal, -EIO);
}
- if (test_opt (sb, ERRORS_RO)) {
- printk (KERN_CRIT "Remounting filesystem read-only\n");
+ if (test_opt(sb, ERRORS_RO)) {
+ printk(KERN_CRIT "Remounting filesystem read-only\n");
sb->s_flags |= MS_RDONLY;
}
ext4_commit_super(sb, es, 1);
@@ -228,13 +231,13 @@ static void ext4_handle_error(struct super_block *sb)
sb->s_id);
}
-void ext4_error (struct super_block * sb, const char * function,
- const char * fmt, ...)
+void ext4_error(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
- printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
vprintk(fmt, args);
printk("\n");
va_end(args);
@@ -242,7 +245,7 @@ void ext4_error (struct super_block * sb, const char * function,
ext4_handle_error(sb);
}
-static const char *ext4_decode_error(struct super_block * sb, int errno,
+static const char *ext4_decode_error(struct super_block *sb, int errno,
char nbuf[16])
{
char *errstr = NULL;
@@ -278,8 +281,7 @@ static const char *ext4_decode_error(struct super_block * sb, int errno,
/* __ext4_std_error decodes expected errors from journaling functions
* automatically and invokes the appropriate error response. */
-void __ext4_std_error (struct super_block * sb, const char * function,
- int errno)
+void __ext4_std_error(struct super_block *sb, const char *function, int errno)
{
char nbuf[16];
const char *errstr;
@@ -292,8 +294,8 @@ void __ext4_std_error (struct super_block * sb, const char * function,
return;
errstr = ext4_decode_error(sb, errno, nbuf);
- printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
- sb->s_id, function, errstr);
+ printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
+ sb->s_id, function, errstr);
ext4_handle_error(sb);
}
@@ -308,15 +310,15 @@ void __ext4_std_error (struct super_block * sb, const char * function,
* case we take the easy way out and panic immediately.
*/
-void ext4_abort (struct super_block * sb, const char * function,
- const char * fmt, ...)
+void ext4_abort(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
va_list args;
- printk (KERN_CRIT "ext4_abort called.\n");
+ printk(KERN_CRIT "ext4_abort called.\n");
va_start(args, fmt);
- printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
vprintk(fmt, args);
printk("\n");
va_end(args);
@@ -334,8 +336,8 @@ void ext4_abort (struct super_block * sb, const char * function,
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
}
-void ext4_warning (struct super_block * sb, const char * function,
- const char * fmt, ...)
+void ext4_warning(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
va_list args;
@@ -496,7 +498,7 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
}
}
-static void ext4_put_super (struct super_block * sb)
+static void ext4_put_super(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
@@ -505,21 +507,27 @@ static void ext4_put_super (struct super_block * sb)
ext4_mb_release(sb);
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
- jbd2_journal_destroy(sbi->s_journal);
+ if (jbd2_journal_destroy(sbi->s_journal) < 0)
+ ext4_abort(sb, __func__, "Couldn't clean up the journal");
+ sbi->s_journal = NULL;
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
- BUFFER_TRACE(sbi->s_sbh, "marking dirty");
- mark_buffer_dirty(sbi->s_sbh);
ext4_commit_super(sb, es, 1);
}
+ if (sbi->s_proc) {
+ remove_proc_entry("inode_readahead_blks", sbi->s_proc);
+ remove_proc_entry(sb->s_id, ext4_proc_root);
+ }
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
+ kfree(sbi->s_flex_groups);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
@@ -562,15 +570,21 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
ei->i_acl = EXT4_ACL_NOT_CACHED;
ei->i_default_acl = EXT4_ACL_NOT_CACHED;
#endif
- ei->i_block_alloc_info = NULL;
ei->vfs_inode.i_version = 1;
+ ei->vfs_inode.i_data.writeback_index = 0;
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
+ jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
+ ei->i_reserved_data_blocks = 0;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_allocated_meta_blocks = 0;
+ ei->i_delalloc_reserved_flag = 0;
+ spin_lock_init(&(ei->i_block_reservation_lock));
return &ei->vfs_inode;
}
@@ -587,12 +601,12 @@ static void ext4_destroy_inode(struct inode *inode)
kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
INIT_LIST_HEAD(&ei->i_orphan);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
init_rwsem(&ei->xattr_sem);
#endif
init_rwsem(&ei->i_data_sem);
@@ -618,8 +632,7 @@ static void destroy_inodecache(void)
static void ext4_clear_inode(struct inode *inode)
{
- struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info;
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
if (EXT4_I(inode)->i_acl &&
EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) {
posix_acl_release(EXT4_I(inode)->i_acl);
@@ -631,20 +644,20 @@ static void ext4_clear_inode(struct inode *inode)
EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED;
}
#endif
- ext4_discard_reservation(inode);
- EXT4_I(inode)->i_block_alloc_info = NULL;
- if (unlikely(rsv))
- kfree(rsv);
+ ext4_discard_preallocations(inode);
+ jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+ &EXT4_I(inode)->jinode);
}
-static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
+static inline void ext4_show_quota_options(struct seq_file *seq,
+ struct super_block *sb)
{
#if defined(CONFIG_QUOTA)
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (sbi->s_jquota_fmt)
seq_printf(seq, ",jqfmt=%s",
- (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
+ (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0");
if (sbi->s_qf_names[USRQUOTA])
seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
@@ -671,7 +684,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
unsigned long def_mount_opts;
struct super_block *sb = vfs->mnt_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- journal_t *journal = sbi->s_journal;
struct ext4_super_block *es = sbi->s_es;
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -709,7 +721,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",debug");
if (test_opt(sb, OLDALLOC))
seq_puts(seq, ",oldalloc");
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
if (test_opt(sb, XATTR_USER) &&
!(def_mount_opts & EXT4_DEFM_XATTR_USER))
seq_puts(seq, ",user_xattr");
@@ -718,7 +730,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",nouser_xattr");
}
#endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
seq_puts(seq, ",acl");
if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
@@ -743,10 +755,11 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",nobh");
if (!test_opt(sb, EXTENTS))
seq_puts(seq, ",noextents");
- if (!test_opt(sb, MBALLOC))
- seq_puts(seq, ",nomballoc");
if (test_opt(sb, I_VERSION))
seq_puts(seq, ",i_version");
+ if (!test_opt(sb, DELALLOC))
+ seq_puts(seq, ",nodelalloc");
+
if (sbi->s_stripe)
seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
@@ -761,6 +774,13 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
seq_puts(seq, ",data=writeback");
+ if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
+ seq_printf(seq, ",inode_readahead_blks=%u",
+ sbi->s_inode_readahead_blks);
+
+ if (test_opt(sb, DATA_ERR_ABORT))
+ seq_puts(seq, ",data_err=abort");
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -810,8 +830,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
}
#ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
-#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
+#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
+#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
static int ext4_dquot_initialize(struct inode *inode, int type);
static int ext4_dquot_drop(struct inode *inode);
@@ -890,14 +910,16 @@ enum {
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+ Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
- Opt_mballoc, Opt_nomballoc, Opt_stripe,
+ Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+ Opt_inode_readahead_blks
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_bsd_df, "bsddf"},
{Opt_minix_df, "minixdf"},
{Opt_grpid, "grpid"},
@@ -935,6 +957,8 @@ static match_table_t tokens = {
{Opt_data_journal, "data=journal"},
{Opt_data_ordered, "data=ordered"},
{Opt_data_writeback, "data=writeback"},
+ {Opt_data_err_abort, "data_err=abort"},
+ {Opt_data_err_ignore, "data_err=ignore"},
{Opt_offusrjquota, "usrjquota="},
{Opt_usrjquota, "usrjquota=%s"},
{Opt_offgrpjquota, "grpjquota="},
@@ -953,6 +977,9 @@ static match_table_t tokens = {
{Opt_nomballoc, "nomballoc"},
{Opt_stripe, "stripe=%u"},
{Opt_resize, "resize"},
+ {Opt_delalloc, "delalloc"},
+ {Opt_nodelalloc, "nodelalloc"},
+ {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
{Opt_err, NULL},
};
@@ -967,7 +994,7 @@ static ext4_fsblk_t get_sb_block(void **data)
/*todo: use simple_strtoll with >32bit ext4 */
sb_block = simple_strtoul(options, &options, 0);
if (*options && *options != ',') {
- printk("EXT4-fs: Invalid sb specification: %s\n",
+ printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
(char *) *data);
return 1;
}
@@ -977,12 +1004,12 @@ static ext4_fsblk_t get_sb_block(void **data)
return sb_block;
}
-static int parse_options (char *options, struct super_block *sb,
- unsigned int *inum, unsigned long *journal_devnum,
- ext4_fsblk_t *n_blocks_count, int is_remount)
+static int parse_options(char *options, struct super_block *sb,
+ unsigned int *inum, unsigned long *journal_devnum,
+ ext4_fsblk_t *n_blocks_count, int is_remount)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- char * p;
+ char *p;
substring_t args[MAX_OPT_ARGS];
int data_opt = 0;
int option;
@@ -990,11 +1017,12 @@ static int parse_options (char *options, struct super_block *sb,
int qtype, qfmt;
char *qname;
#endif
+ ext4_fsblk_t last_block;
if (!options)
return 1;
- while ((p = strsep (&options, ",")) != NULL) {
+ while ((p = strsep(&options, ",")) != NULL) {
int token;
if (!*p)
continue;
@@ -1002,16 +1030,16 @@ static int parse_options (char *options, struct super_block *sb,
token = match_token(p, tokens, args);
switch (token) {
case Opt_bsd_df:
- clear_opt (sbi->s_mount_opt, MINIX_DF);
+ clear_opt(sbi->s_mount_opt, MINIX_DF);
break;
case Opt_minix_df:
- set_opt (sbi->s_mount_opt, MINIX_DF);
+ set_opt(sbi->s_mount_opt, MINIX_DF);
break;
case Opt_grpid:
- set_opt (sbi->s_mount_opt, GRPID);
+ set_opt(sbi->s_mount_opt, GRPID);
break;
case Opt_nogrpid:
- clear_opt (sbi->s_mount_opt, GRPID);
+ clear_opt(sbi->s_mount_opt, GRPID);
break;
case Opt_resuid:
if (match_int(&args[0], &option))
@@ -1028,49 +1056,50 @@ static int parse_options (char *options, struct super_block *sb,
/* *sb_block = match_int(&args[0]); */
break;
case Opt_err_panic:
- clear_opt (sbi->s_mount_opt, ERRORS_CONT);
- clear_opt (sbi->s_mount_opt, ERRORS_RO);
- set_opt (sbi->s_mount_opt, ERRORS_PANIC);
+ clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+ clear_opt(sbi->s_mount_opt, ERRORS_RO);
+ set_opt(sbi->s_mount_opt, ERRORS_PANIC);
break;
case Opt_err_ro:
- clear_opt (sbi->s_mount_opt, ERRORS_CONT);
- clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
- set_opt (sbi->s_mount_opt, ERRORS_RO);
+ clear_opt(sbi->s_mount_opt, ERRORS_CONT);
+ clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
+ set_opt(sbi->s_mount_opt, ERRORS_RO);
break;
case Opt_err_cont:
- clear_opt (sbi->s_mount_opt, ERRORS_RO);
- clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
- set_opt (sbi->s_mount_opt, ERRORS_CONT);
+ clear_opt(sbi->s_mount_opt, ERRORS_RO);
+ clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
+ set_opt(sbi->s_mount_opt, ERRORS_CONT);
break;
case Opt_nouid32:
- set_opt (sbi->s_mount_opt, NO_UID32);
+ set_opt(sbi->s_mount_opt, NO_UID32);
break;
case Opt_nocheck:
- clear_opt (sbi->s_mount_opt, CHECK);
+ clear_opt(sbi->s_mount_opt, CHECK);
break;
case Opt_debug:
- set_opt (sbi->s_mount_opt, DEBUG);
+ set_opt(sbi->s_mount_opt, DEBUG);
break;
case Opt_oldalloc:
- set_opt (sbi->s_mount_opt, OLDALLOC);
+ set_opt(sbi->s_mount_opt, OLDALLOC);
break;
case Opt_orlov:
- clear_opt (sbi->s_mount_opt, OLDALLOC);
+ clear_opt(sbi->s_mount_opt, OLDALLOC);
break;
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
case Opt_user_xattr:
- set_opt (sbi->s_mount_opt, XATTR_USER);
+ set_opt(sbi->s_mount_opt, XATTR_USER);
break;
case Opt_nouser_xattr:
- clear_opt (sbi->s_mount_opt, XATTR_USER);
+ clear_opt(sbi->s_mount_opt, XATTR_USER);
break;
#else
case Opt_user_xattr:
case Opt_nouser_xattr:
- printk("EXT4 (no)user_xattr options not supported\n");
+ printk(KERN_ERR "EXT4 (no)user_xattr options "
+ "not supported\n");
break;
#endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
case Opt_acl:
set_opt(sbi->s_mount_opt, POSIX_ACL);
break;
@@ -1080,7 +1109,8 @@ static int parse_options (char *options, struct super_block *sb,
#else
case Opt_acl:
case Opt_noacl:
- printk("EXT4 (no)acl options not supported\n");
+ printk(KERN_ERR "EXT4 (no)acl options "
+ "not supported\n");
break;
#endif
case Opt_reservation:
@@ -1100,7 +1130,7 @@ static int parse_options (char *options, struct super_block *sb,
"journal on remount\n");
return 0;
}
- set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
+ set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
break;
case Opt_journal_inum:
if (is_remount) {
@@ -1130,7 +1160,7 @@ static int parse_options (char *options, struct super_block *sb,
set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
break;
case Opt_noload:
- set_opt (sbi->s_mount_opt, NOLOAD);
+ set_opt(sbi->s_mount_opt, NOLOAD);
break;
case Opt_commit:
if (match_int(&args[0], &option))
@@ -1163,6 +1193,12 @@ static int parse_options (char *options, struct super_block *sb,
sbi->s_mount_opt |= data_opt;
}
break;
+ case Opt_data_err_abort:
+ set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+ break;
+ case Opt_data_err_ignore:
+ clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+ break;
#ifdef CONFIG_QUOTA
case Opt_usrjquota:
qtype = USRQUOTA;
@@ -1174,8 +1210,8 @@ set_qf_name:
sb_any_quota_suspended(sb)) &&
!sbi->s_qf_names[qtype]) {
printk(KERN_ERR
- "EXT4-fs: Cannot change journaled "
- "quota options when quota turned on.\n");
+ "EXT4-fs: Cannot change journaled "
+ "quota options when quota turned on.\n");
return 0;
}
qname = match_strdup(&args[0]);
@@ -1309,20 +1345,38 @@ set_qf_format:
clear_opt(sbi->s_mount_opt, NOBH);
break;
case Opt_extents:
- set_opt (sbi->s_mount_opt, EXTENTS);
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ ext4_warning(sb, __func__,
+ "extents feature not enabled "
+ "on this filesystem, use tune2fs\n");
+ return 0;
+ }
+ set_opt(sbi->s_mount_opt, EXTENTS);
break;
case Opt_noextents:
- clear_opt (sbi->s_mount_opt, EXTENTS);
+ /*
+ * When e2fsprogs support resizing an already existing
+ * ext3 file system to greater than 2**32 we need to
+ * add support to block allocator to handle growing
+ * already existing block mapped inode so that blocks
+ * allocated for them fall within 2**32
+ */
+ last_block = ext4_blocks_count(sbi->s_es) - 1;
+ if (last_block > 0xffffffffULL) {
+ printk(KERN_ERR "EXT4-fs: Filesystem too "
+ "large to mount with "
+ "-o noextents options\n");
+ return 0;
+ }
+ clear_opt(sbi->s_mount_opt, EXTENTS);
break;
case Opt_i_version:
set_opt(sbi->s_mount_opt, I_VERSION);
sb->s_flags |= MS_I_VERSION;
break;
- case Opt_mballoc:
- set_opt(sbi->s_mount_opt, MBALLOC);
- break;
- case Opt_nomballoc:
- clear_opt(sbi->s_mount_opt, MBALLOC);
+ case Opt_nodelalloc:
+ clear_opt(sbi->s_mount_opt, DELALLOC);
break;
case Opt_stripe:
if (match_int(&args[0], &option))
@@ -1331,10 +1385,20 @@ set_qf_format:
return 0;
sbi->s_stripe = option;
break;
+ case Opt_delalloc:
+ set_opt(sbi->s_mount_opt, DELALLOC);
+ break;
+ case Opt_inode_readahead_blks:
+ if (match_int(&args[0], &option))
+ return 0;
+ if (option < 0 || option > (1 << 30))
+ return 0;
+ sbi->s_inode_readahead_blks = option;
+ break;
default:
- printk (KERN_ERR
- "EXT4-fs: Unrecognized mount option \"%s\" "
- "or missing value\n", p);
+ printk(KERN_ERR
+ "EXT4-fs: Unrecognized mount option \"%s\" "
+ "or missing value\n", p);
return 0;
}
}
@@ -1381,31 +1445,31 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
int res = 0;
if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
- printk (KERN_ERR "EXT4-fs warning: revision level too high, "
- "forcing read-only mode\n");
+ printk(KERN_ERR "EXT4-fs warning: revision level too high, "
+ "forcing read-only mode\n");
res = MS_RDONLY;
}
if (read_only)
return res;
if (!(sbi->s_mount_state & EXT4_VALID_FS))
- printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
- "running e2fsck is recommended\n");
+ printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
+ "running e2fsck is recommended\n");
else if ((sbi->s_mount_state & EXT4_ERROR_FS))
- printk (KERN_WARNING
- "EXT4-fs warning: mounting fs with errors, "
- "running e2fsck is recommended\n");
+ printk(KERN_WARNING
+ "EXT4-fs warning: mounting fs with errors, "
+ "running e2fsck is recommended\n");
else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
le16_to_cpu(es->s_mnt_count) >=
(unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
- printk (KERN_WARNING
- "EXT4-fs warning: maximal mount count reached, "
- "running e2fsck is recommended\n");
+ printk(KERN_WARNING
+ "EXT4-fs warning: maximal mount count reached, "
+ "running e2fsck is recommended\n");
else if (le32_to_cpu(es->s_checkinterval) &&
(le32_to_cpu(es->s_lastcheck) +
le32_to_cpu(es->s_checkinterval) <= get_seconds()))
- printk (KERN_WARNING
- "EXT4-fs warning: checktime reached, "
- "running e2fsck is recommended\n");
+ printk(KERN_WARNING
+ "EXT4-fs warning: checktime reached, "
+ "running e2fsck is recommended\n");
#if 0
/* @@@ We _will_ want to clear the valid bit if we find
* inconsistencies, to force a fsck at reboot. But for
@@ -1431,16 +1495,60 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
EXT4_INODES_PER_GROUP(sb),
sbi->s_mount_opt);
- printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id);
- if (EXT4_SB(sb)->s_journal->j_inode == NULL) {
- char b[BDEVNAME_SIZE];
+ printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
+ sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+ "external", EXT4_SB(sb)->s_journal->j_devname);
+ return res;
+}
- printk("external journal on %s\n",
- bdevname(EXT4_SB(sb)->s_journal->j_dev, b));
- } else {
- printk("internal journal\n");
+static int ext4_fill_flex_info(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = NULL;
+ struct buffer_head *bh;
+ ext4_group_t flex_group_count;
+ ext4_group_t flex_group;
+ int groups_per_flex = 0;
+ __u64 block_bitmap = 0;
+ int i;
+
+ if (!sbi->s_es->s_log_groups_per_flex) {
+ sbi->s_log_groups_per_flex = 0;
+ return 1;
}
- return res;
+
+ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+
+ /* We allocate both existing and potentially added groups */
+ flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
+ ((sbi->s_es->s_reserved_gdt_blocks +1 ) <<
+ EXT4_DESC_PER_BLOCK_BITS(sb))) /
+ groups_per_flex;
+ sbi->s_flex_groups = kzalloc(flex_group_count *
+ sizeof(struct flex_groups), GFP_KERNEL);
+ if (sbi->s_flex_groups == NULL) {
+ printk(KERN_ERR "EXT4-fs: not enough memory for "
+ "%lu flex groups\n", flex_group_count);
+ goto failed;
+ }
+
+ gdp = ext4_get_group_desc(sb, 1, &bh);
+ block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
+
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ gdp = ext4_get_group_desc(sb, i, &bh);
+
+ flex_group = ext4_flex_group(sbi, i);
+ sbi->s_flex_groups[flex_group].free_inodes +=
+ le16_to_cpu(gdp->bg_free_inodes_count);
+ sbi->s_flex_groups[flex_group].free_blocks +=
+ le16_to_cpu(gdp->bg_free_blocks_count);
+ }
+
+ return 1;
+failed:
+ return 0;
}
__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
@@ -1495,7 +1603,7 @@ static int ext4_check_descriptors(struct super_block *sb)
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
flexbg_flag = 1;
- ext4_debug ("Checking group descriptors");
+ ext4_debug("Checking group descriptors");
for (i = 0; i < sbi->s_groups_count; i++) {
struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
@@ -1507,16 +1615,14 @@ static int ext4_check_descriptors(struct super_block *sb)
(EXT4_BLOCKS_PER_GROUP(sb) - 1);
block_bitmap = ext4_block_bitmap(sb, gdp);
- if (block_bitmap < first_block || block_bitmap > last_block)
- {
+ if (block_bitmap < first_block || block_bitmap > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Block bitmap for group %lu not in group "
"(block %llu)!", i, block_bitmap);
return 0;
}
inode_bitmap = ext4_inode_bitmap(sb, gdp);
- if (inode_bitmap < first_block || inode_bitmap > last_block)
- {
+ if (inode_bitmap < first_block || inode_bitmap > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Inode bitmap for group %lu not in group "
"(block %llu)!", i, inode_bitmap);
@@ -1524,26 +1630,30 @@ static int ext4_check_descriptors(struct super_block *sb)
}
inode_table = ext4_inode_table(sb, gdp);
if (inode_table < first_block ||
- inode_table + sbi->s_itb_per_group - 1 > last_block)
- {
+ inode_table + sbi->s_itb_per_group - 1 > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Inode table for group %lu not in group "
"(block %llu)!", i, inode_table);
return 0;
}
+ spin_lock(sb_bgl_lock(sbi, i));
if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Checksum for group %lu failed (%u!=%u)\n",
i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
gdp)), le16_to_cpu(gdp->bg_checksum));
- return 0;
+ if (!(sb->s_flags & MS_RDONLY)) {
+ spin_unlock(sb_bgl_lock(sbi, i));
+ return 0;
+ }
}
+ spin_unlock(sb_bgl_lock(sbi, i));
if (!flexbg_flag)
first_block += EXT4_BLOCKS_PER_GROUP(sb);
}
ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
- sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb));
+ sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
return 1;
}
@@ -1564,8 +1674,8 @@ static int ext4_check_descriptors(struct super_block *sb)
* e2fsck was run on this filesystem, and it must have already done the orphan
* inode cleanup for us, so we can safely abort without any further action.
*/
-static void ext4_orphan_cleanup (struct super_block * sb,
- struct ext4_super_block * es)
+static void ext4_orphan_cleanup(struct super_block *sb,
+ struct ext4_super_block *es)
{
unsigned int s_flags = sb->s_flags;
int nr_orphans = 0, nr_truncates = 0;
@@ -1625,9 +1735,9 @@ static void ext4_orphan_cleanup (struct super_block * sb,
DQUOT_INIT(inode);
if (inode->i_nlink) {
printk(KERN_DEBUG
- "%s: truncating inode %lu to %Ld bytes\n",
+ "%s: truncating inode %lu to %lld bytes\n",
__func__, inode->i_ino, inode->i_size);
- jbd_debug(2, "truncating inode %lu to %Ld bytes\n",
+ jbd_debug(2, "truncating inode %lu to %lld bytes\n",
inode->i_ino, inode->i_size);
ext4_truncate(inode);
nr_truncates++;
@@ -1642,7 +1752,7 @@ static void ext4_orphan_cleanup (struct super_block * sb,
iput(inode); /* The delete magic happens here! */
}
-#define PLURAL(x) (x), ((x)==1) ? "" : "s"
+#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
if (nr_orphans)
printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
@@ -1809,12 +1919,12 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
return 0;
}
-static int ext4_fill_super (struct super_block *sb, void *data, int silent)
- __releases(kernel_sem)
- __acquires(kernel_sem)
+static int ext4_fill_super(struct super_block *sb, void *data, int silent)
+ __releases(kernel_lock)
+ __acquires(kernel_lock)
{
- struct buffer_head * bh;
+ struct buffer_head *bh;
struct ext4_super_block *es = NULL;
struct ext4_sb_info *sbi;
ext4_fsblk_t block;
@@ -1825,6 +1935,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
unsigned long journal_devnum = 0;
unsigned long def_mount_opts;
struct inode *root;
+ char *cp;
int ret = -EINVAL;
int blocksize;
int db_count;
@@ -1841,21 +1952,21 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_mount_opt = 0;
sbi->s_resuid = EXT4_DEF_RESUID;
sbi->s_resgid = EXT4_DEF_RESGID;
+ sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sb_block = sb_block;
unlock_kernel();
+ /* Cleanup superblock name */
+ for (cp = sb->s_id; (cp = strchr(cp, '/'));)
+ *cp = '!';
+
blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
if (!blocksize) {
printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
goto out_fail;
}
- if (!sb_set_blocksize(sb, blocksize)) {
- printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
- goto out_fail;
- }
-
/*
* The ext4 superblock will not be buffer aligned for other than 1kB
* block sizes. We need to calculate the offset from buffer start.
@@ -1868,7 +1979,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
}
if (!(bh = sb_bread(sb, logical_sb_block))) {
- printk (KERN_ERR "EXT4-fs: unable to read superblock\n");
+ printk(KERN_ERR "EXT4-fs: unable to read superblock\n");
goto out_fail;
}
/*
@@ -1889,11 +2000,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
set_opt(sbi->s_mount_opt, GRPID);
if (def_mount_opts & EXT4_DEFM_UID16)
set_opt(sbi->s_mount_opt, NO_UID32);
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
if (def_mount_opts & EXT4_DEFM_XATTR_USER)
set_opt(sbi->s_mount_opt, XATTR_USER);
#endif
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
if (def_mount_opts & EXT4_DEFM_ACL)
set_opt(sbi->s_mount_opt, POSIX_ACL);
#endif
@@ -1919,17 +2030,25 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
/*
* turn on extents feature by default in ext4 filesystem
- * User -o noextents to turn it off
+ * only if feature flag already set by mkfs or tune2fs.
+ * Use -o noextents to turn it off
*/
- set_opt(sbi->s_mount_opt, EXTENTS);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+ set_opt(sbi->s_mount_opt, EXTENTS);
+ else
+ ext4_warning(sb, __func__,
+ "extents feature not enabled on this filesystem, "
+ "use tune2fs.\n");
+
/*
- * turn on mballoc feature by default in ext4 filesystem
- * User -o nomballoc to turn it off
+ * enable delayed allocation by default
+ * Use -o nodelalloc to turn it off
*/
- set_opt(sbi->s_mount_opt, MBALLOC);
+ set_opt(sbi->s_mount_opt, DELALLOC);
- if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
- NULL, 0))
+
+ if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum,
+ NULL, 0))
goto failed_mount;
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
@@ -1944,16 +2063,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
"running e2fsck is recommended\n");
/*
- * Since ext4 is still considered development code, we require
- * that the TEST_FILESYS flag in s->flags be set.
- */
- if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) {
- printk(KERN_WARNING "EXT4-fs: %s: not marked "
- "OK to use with test code.\n", sb->s_id);
- goto failed_mount;
- }
-
- /*
* Check feature flags regardless of the revision level, since we
* previously didn't change the revision level when setting the flags,
* so there is a chance incompat flags are set on a rev 0 filesystem.
@@ -2004,7 +2113,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- brelse (bh);
+ brelse(bh);
logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
offset = do_div(logical_sb_block, blocksize);
bh = sb_bread(sb, logical_sb_block);
@@ -2016,8 +2125,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
sbi->s_es = es;
if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
- printk (KERN_ERR
- "EXT4-fs: Magic mismatch, very weird !\n");
+ printk(KERN_ERR
+ "EXT4-fs: Magic mismatch, very weird !\n");
goto failed_mount;
}
}
@@ -2034,9 +2143,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
(!is_power_of_2(sbi->s_inode_size)) ||
(sbi->s_inode_size > blocksize)) {
- printk (KERN_ERR
- "EXT4-fs: unsupported inode size: %d\n",
- sbi->s_inode_size);
+ printk(KERN_ERR
+ "EXT4-fs: unsupported inode size: %d\n",
+ sbi->s_inode_size);
goto failed_mount;
}
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
@@ -2068,20 +2177,20 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_mount_state = le16_to_cpu(es->s_state);
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
- for (i=0; i < 4; i++)
+ for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
if (sbi->s_blocks_per_group > blocksize * 8) {
- printk (KERN_ERR
- "EXT4-fs: #blocks per group too big: %lu\n",
- sbi->s_blocks_per_group);
+ printk(KERN_ERR
+ "EXT4-fs: #blocks per group too big: %lu\n",
+ sbi->s_blocks_per_group);
goto failed_mount;
}
if (sbi->s_inodes_per_group > blocksize * 8) {
- printk (KERN_ERR
- "EXT4-fs: #inodes per group too big: %lu\n",
- sbi->s_inodes_per_group);
+ printk(KERN_ERR
+ "EXT4-fs: #inodes per group too big: %lu\n",
+ sbi->s_inodes_per_group);
goto failed_mount;
}
@@ -2115,29 +2224,47 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_groups_count = blocks_count;
db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
EXT4_DESC_PER_BLOCK(sb);
- sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
+ sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
GFP_KERNEL);
if (sbi->s_group_desc == NULL) {
- printk (KERN_ERR "EXT4-fs: not enough memory\n");
+ printk(KERN_ERR "EXT4-fs: not enough memory\n");
goto failed_mount;
}
+#ifdef CONFIG_PROC_FS
+ if (ext4_proc_root)
+ sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
+
+ if (sbi->s_proc)
+ proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
+ &ext4_ui_proc_fops,
+ &sbi->s_inode_readahead_blks);
+#endif
+
bgl_lock_init(&sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
sbi->s_group_desc[i] = sb_bread(sb, block);
if (!sbi->s_group_desc[i]) {
- printk (KERN_ERR "EXT4-fs: "
- "can't read group descriptor %d\n", i);
+ printk(KERN_ERR "EXT4-fs: "
+ "can't read group descriptor %d\n", i);
db_count = i;
goto failed_mount2;
}
}
- if (!ext4_check_descriptors (sb)) {
+ if (!ext4_check_descriptors(sb)) {
printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
goto failed_mount2;
}
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (!ext4_fill_flex_info(sb)) {
+ printk(KERN_ERR
+ "EXT4-fs: unable to initialize "
+ "flex_bg meta info!\n");
+ goto failed_mount2;
+ }
+
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
@@ -2152,24 +2279,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
err = percpu_counter_init(&sbi->s_dirs_counter,
ext4_count_dirs(sb));
}
+ if (!err) {
+ err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+ }
if (err) {
printk(KERN_ERR "EXT4-fs: insufficient memory\n");
goto failed_mount3;
}
- /* per fileystem reservation list head & lock */
- spin_lock_init(&sbi->s_rsv_window_lock);
- sbi->s_rsv_window_root = RB_ROOT;
- /* Add a single, static dummy reservation to the start of the
- * reservation window list --- it gives us a placeholder for
- * append-at-start-of-list which makes the allocation logic
- * _much_ simpler. */
- sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
- sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED;
- sbi->s_rsv_window_head.rsv_alloc_hit = 0;
- sbi->s_rsv_window_head.rsv_goal_size = 0;
- ext4_rsv_window_add(sb, &sbi->s_rsv_window_head);
-
sbi->s_stripe = ext4_get_stripe_size(sbi);
/*
@@ -2202,11 +2319,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
EXT4_SB(sb)->s_journal->j_failed_commit) {
printk(KERN_CRIT "EXT4-fs error (device %s): "
"ext4_fill_super: Journal transaction "
- "%u is corrupt\n", sb->s_id,
+ "%u is corrupt\n", sb->s_id,
EXT4_SB(sb)->s_journal->j_failed_commit);
- if (test_opt (sb, ERRORS_RO)) {
- printk (KERN_CRIT
- "Mounting filesystem read-only\n");
+ if (test_opt(sb, ERRORS_RO)) {
+ printk(KERN_CRIT
+ "Mounting filesystem read-only\n");
sb->s_flags |= MS_RDONLY;
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -2226,9 +2343,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount3;
} else {
if (!silent)
- printk (KERN_ERR
- "ext4: No journal on filesystem on %s\n",
- sb->s_id);
+ printk(KERN_ERR
+ "ext4: No journal on filesystem on %s\n",
+ sb->s_id);
goto failed_mount3;
}
@@ -2312,7 +2429,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
goto failed_mount4;
}
- ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+ ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY);
/* determine the minimum size of new large inodes, if present */
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
@@ -2351,15 +2468,27 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
ext4_orphan_cleanup(sb, es);
EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
if (needs_recovery)
- printk (KERN_INFO "EXT4-fs: recovery complete.\n");
+ printk(KERN_INFO "EXT4-fs: recovery complete.\n");
ext4_mark_recovery_complete(sb, es);
- printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
- test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
- test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
- "writeback");
+ printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
+ test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
+ test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
+ "writeback");
+
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+ "requested data journaling mode\n");
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ } else if (test_opt(sb, DELALLOC))
+ printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
ext4_ext_init(sb);
- ext4_mb_init(sb, needs_recovery);
+ err = ext4_mb_init(sb, needs_recovery);
+ if (err) {
+ printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
+ err);
+ goto failed_mount4;
+ }
lock_kernel();
return 0;
@@ -2372,15 +2501,21 @@ cantfind_ext4:
failed_mount4:
jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
failed_mount3:
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
+ percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
failed_mount2:
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
failed_mount:
+ if (sbi->s_proc) {
+ remove_proc_entry("inode_readahead_blks", sbi->s_proc);
+ remove_proc_entry(sb->s_id, ext4_proc_root);
+ }
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
@@ -2414,6 +2549,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
journal->j_flags |= JBD2_BARRIER;
else
journal->j_flags &= ~JBD2_BARRIER;
+ if (test_opt(sb, DATA_ERR_ABORT))
+ journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
+ else
+ journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
spin_unlock(&journal->j_state_lock);
}
@@ -2439,7 +2578,7 @@ static journal_t *ext4_get_journal(struct super_block *sb,
return NULL;
}
- jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
+ jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
journal_inode, journal_inode->i_size);
if (!S_ISREG(journal_inode->i_mode)) {
printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
@@ -2461,14 +2600,14 @@ static journal_t *ext4_get_journal(struct super_block *sb,
static journal_t *ext4_get_dev_journal(struct super_block *sb,
dev_t j_dev)
{
- struct buffer_head * bh;
+ struct buffer_head *bh;
journal_t *journal;
ext4_fsblk_t start;
ext4_fsblk_t len;
int hblock, blocksize;
ext4_fsblk_t sb_block;
unsigned long offset;
- struct ext4_super_block * es;
+ struct ext4_super_block *es;
struct block_device *bdev;
bdev = ext4_blkdev_get(j_dev);
@@ -2583,8 +2722,8 @@ static int ext4_load_journal(struct super_block *sb,
"unavailable, cannot proceed.\n");
return -EROFS;
}
- printk (KERN_INFO "EXT4-fs: write access will "
- "be enabled during recovery.\n");
+ printk(KERN_INFO "EXT4-fs: write access will "
+ "be enabled during recovery.\n");
}
}
@@ -2602,6 +2741,11 @@ static int ext4_load_journal(struct super_block *sb,
return -EINVAL;
}
+ if (journal->j_flags & JBD2_BARRIER)
+ printk(KERN_INFO "EXT4-fs: barriers enabled\n");
+ else
+ printk(KERN_INFO "EXT4-fs: barriers disabled\n");
+
if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
err = jbd2_journal_update_format(journal);
if (err) {
@@ -2637,8 +2781,8 @@ static int ext4_load_journal(struct super_block *sb,
return 0;
}
-static int ext4_create_journal(struct super_block * sb,
- struct ext4_super_block * es,
+static int ext4_create_journal(struct super_block *sb,
+ struct ext4_super_block *es,
unsigned int journal_inum)
{
journal_t *journal;
@@ -2679,21 +2823,41 @@ static int ext4_create_journal(struct super_block * sb,
return 0;
}
-static void ext4_commit_super (struct super_block * sb,
- struct ext4_super_block * es,
- int sync)
+static void ext4_commit_super(struct super_block *sb,
+ struct ext4_super_block *es, int sync)
{
struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
if (!sbh)
return;
+ if (buffer_write_io_error(sbh)) {
+ /*
+ * Oh, dear. A previous attempt to write the
+ * superblock failed. This could happen because the
+ * USB device was yanked out. Or it could happen to
+ * be a transient write error and maybe the block will
+ * be remapped. Nothing we can do but to retry the
+ * write and hope for the best.
+ */
+ printk(KERN_ERR "ext4: previous I/O error to "
+ "superblock detected for %s.\n", sb->s_id);
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ }
es->s_wtime = cpu_to_le32(get_seconds());
ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb));
es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
BUFFER_TRACE(sbh, "marking dirty");
mark_buffer_dirty(sbh);
- if (sync)
+ if (sync) {
sync_dirty_buffer(sbh);
+ if (buffer_write_io_error(sbh)) {
+ printk(KERN_ERR "ext4: I/O error while writing "
+ "superblock for %s.\n", sb->s_id);
+ clear_buffer_write_io_error(sbh);
+ set_buffer_uptodate(sbh);
+ }
+ }
}
@@ -2702,13 +2866,15 @@ static void ext4_commit_super (struct super_block * sb,
* remounting) the filesystem readonly, then we will end up with a
* consistent fs on disk. Record that fact.
*/
-static void ext4_mark_recovery_complete(struct super_block * sb,
- struct ext4_super_block * es)
+static void ext4_mark_recovery_complete(struct super_block *sb,
+ struct ext4_super_block *es)
{
journal_t *journal = EXT4_SB(sb)->s_journal;
jbd2_journal_lock_updates(journal);
- jbd2_journal_flush(journal);
+ if (jbd2_journal_flush(journal) < 0)
+ goto out;
+
lock_super(sb);
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
sb->s_flags & MS_RDONLY) {
@@ -2717,6 +2883,8 @@ static void ext4_mark_recovery_complete(struct super_block * sb,
ext4_commit_super(sb, es, 1);
}
unlock_super(sb);
+
+out:
jbd2_journal_unlock_updates(journal);
}
@@ -2725,8 +2893,8 @@ static void ext4_mark_recovery_complete(struct super_block * sb,
* has recorded an error from a previous lifetime, move that error to the
* main filesystem now.
*/
-static void ext4_clear_journal_err(struct super_block * sb,
- struct ext4_super_block * es)
+static void ext4_clear_journal_err(struct super_block *sb,
+ struct ext4_super_block *es)
{
journal_t *journal;
int j_errno;
@@ -2751,7 +2919,7 @@ static void ext4_clear_journal_err(struct super_block * sb,
EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
- ext4_commit_super (sb, es, 1);
+ ext4_commit_super(sb, es, 1);
jbd2_journal_clear_err(journal);
}
@@ -2784,7 +2952,7 @@ int ext4_force_commit(struct super_block *sb)
* This implicitly triggers the writebehind on sync().
*/
-static void ext4_write_super (struct super_block * sb)
+static void ext4_write_super(struct super_block *sb)
{
if (mutex_trylock(&sb->s_lock) != 0)
BUG();
@@ -2795,6 +2963,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
{
tid_t target;
+ trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
sb->s_dirt = 0;
if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
if (wait)
@@ -2816,7 +2985,13 @@ static void ext4_write_super_lockfs(struct super_block *sb)
/* Now we set up the journal barrier. */
jbd2_journal_lock_updates(journal);
- jbd2_journal_flush(journal);
+
+ /*
+ * We don't want to clear needs_recovery flag when we failed
+ * to flush the journal.
+ */
+ if (jbd2_journal_flush(journal) < 0)
+ return;
/* Journal blocked and flushed, clear needs_recovery flag. */
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -2840,13 +3015,14 @@ static void ext4_unlockfs(struct super_block *sb)
}
}
-static int ext4_remount (struct super_block * sb, int * flags, char * data)
+static int ext4_remount(struct super_block *sb, int *flags, char *data)
{
- struct ext4_super_block * es;
+ struct ext4_super_block *es;
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t n_blocks_count = 0;
unsigned long old_sb_flags;
struct ext4_mount_options old_opts;
+ ext4_group_t g;
int err;
#ifdef CONFIG_QUOTA
int i;
@@ -2925,6 +3101,26 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
}
/*
+ * Make sure the group descriptor checksums
+ * are sane. If they aren't, refuse to
+ * remount r/w.
+ */
+ for (g = 0; g < sbi->s_groups_count; g++) {
+ struct ext4_group_desc *gdp =
+ ext4_get_group_desc(sb, g, NULL);
+
+ if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
+ printk(KERN_ERR
+ "EXT4-fs: ext4_remount: "
+ "Checksum for group %lu failed (%u!=%u)\n",
+ g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
+ le16_to_cpu(gdp->bg_checksum));
+ err = -EINVAL;
+ goto restore_opts;
+ }
+ }
+
+ /*
* If we have an unprocessed orphan list hanging
* around from a previously readonly bdev mount,
* require a full umount/remount for now.
@@ -2949,7 +3145,7 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
sbi->s_mount_state = le16_to_cpu(es->s_state);
if ((err = ext4_group_extend(sb, es, n_blocks_count)))
goto restore_opts;
- if (!ext4_setup_super (sb, es, 0))
+ if (!ext4_setup_super(sb, es, 0))
sb->s_flags &= ~MS_RDONLY;
}
}
@@ -2979,7 +3175,7 @@ restore_opts:
return err;
}
-static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
+static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3029,7 +3225,8 @@ static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
buf->f_type = EXT4_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
- buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+ buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
+ percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
ext4_free_blocks_count_set(es, buf->f_bfree);
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
if (buf->f_bfree < ext4_r_blocks_count(es))
@@ -3217,12 +3414,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
}
/* Journaling quota? */
if (EXT4_SB(sb)->s_qf_names[type]) {
- /* Quotafile not of fs root? */
+ /* Quotafile not in fs root? */
if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
printk(KERN_WARNING
"EXT4-fs: Quota file not on filesystem root. "
"Journaled quota will not work.\n");
- }
+ }
/*
* When we journal data on quota file, we have to flush journal to see
@@ -3234,12 +3431,17 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
* otherwise be livelocked...
*/
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
- jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ if (err) {
+ path_put(&nd.path);
+ return err;
+ }
}
+ err = vfs_quota_on_path(sb, type, format_id, &nd.path);
path_put(&nd.path);
- return vfs_quota_on(sb, type, format_id, path, remount);
+ return err;
}
/* Read data from quotafile - avoid pagecache and such because we cannot afford
@@ -3298,7 +3500,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
handle_t *handle = journal_current_handle();
if (!handle) {
- printk(KERN_WARNING "EXT4-fs: Quota write (off=%Lu, len=%Lu)"
+ printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
" cancelled because transaction is not started.\n",
(unsigned long long)off, (unsigned long long)len);
return -EIO;
@@ -3325,7 +3527,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
err = ext4_journal_dirty_metadata(handle, bh);
else {
/* Always do at least ordered writes for quotas */
- err = ext4_journal_dirty_data(handle, bh);
+ err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
brelse(bh);
@@ -3337,8 +3539,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
blk++;
}
out:
- if (len == towrite)
+ if (len == towrite) {
+ mutex_unlock(&inode->i_mutex);
return err;
+ }
if (inode->i_size < off+len-towrite) {
i_size_write(inode, off+len-towrite);
EXT4_I(inode)->i_disksize = inode->i_size;
@@ -3357,18 +3561,82 @@ static int ext4_get_sb(struct file_system_type *fs_type,
return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
}
+#ifdef CONFIG_PROC_FS
+static int ext4_ui_proc_show(struct seq_file *m, void *v)
+{
+ unsigned int *p = m->private;
+
+ seq_printf(m, "%u\n", *p);
+ return 0;
+}
+
+static int ext4_ui_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
+}
+
+static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
+ size_t cnt, loff_t *ppos)
+{
+ unsigned int *p = PDE(file->f_path.dentry->d_inode)->data;
+ char str[32];
+ unsigned long value;
+
+ if (cnt >= sizeof(str))
+ return -EINVAL;
+ if (copy_from_user(str, buf, cnt))
+ return -EFAULT;
+ value = simple_strtol(str, NULL, 0);
+ if (value < 0)
+ return -ERANGE;
+ *p = value;
+ return cnt;
+}
+
+const struct file_operations ext4_ui_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = ext4_ui_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = ext4_ui_proc_write,
+};
+#endif
+
+static struct file_system_type ext4_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ext4",
+ .get_sb = ext4_get_sb,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+#ifdef CONFIG_EXT4DEV_COMPAT
+static int ext4dev_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+{
+ printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
+ "to mount using ext4\n");
+ printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
+ "will go away by 2.6.31\n");
+ return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+}
+
static struct file_system_type ext4dev_fs_type = {
.owner = THIS_MODULE,
.name = "ext4dev",
- .get_sb = ext4_get_sb,
+ .get_sb = ext4dev_get_sb,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
+MODULE_ALIAS("ext4dev");
+#endif
static int __init init_ext4_fs(void)
{
int err;
+ ext4_proc_root = proc_mkdir("fs/ext4", NULL);
err = init_ext4_mballoc();
if (err)
return err;
@@ -3379,9 +3647,16 @@ static int __init init_ext4_fs(void)
err = init_inodecache();
if (err)
goto out1;
- err = register_filesystem(&ext4dev_fs_type);
+ err = register_filesystem(&ext4_fs_type);
if (err)
goto out;
+#ifdef CONFIG_EXT4DEV_COMPAT
+ err = register_filesystem(&ext4dev_fs_type);
+ if (err) {
+ unregister_filesystem(&ext4_fs_type);
+ goto out;
+ }
+#endif
return 0;
out:
destroy_inodecache();
@@ -3394,10 +3669,14 @@ out2:
static void __exit exit_ext4_fs(void)
{
+ unregister_filesystem(&ext4_fs_type);
+#ifdef CONFIG_EXT4DEV_COMPAT
unregister_filesystem(&ext4dev_fs_type);
+#endif
destroy_inodecache();
exit_ext4_xattr();
exit_ext4_mballoc();
+ remove_proc_entry("fs/ext4", NULL);
}
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index e9178643dc0..00740cb32be 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,10 +23,10 @@
#include "ext4.h"
#include "xattr.h"
-static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
+static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
{
struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
- nd_set_link(nd, (char*)ei->i_data);
+ nd_set_link(nd, (char *) ei->i_data);
return NULL;
}
@@ -34,7 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
@@ -45,7 +45,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = ext4_follow_link,
-#ifdef CONFIG_EXT4DEV_FS_XATTR
+#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ff08633f398..80626d516fe 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -99,12 +99,12 @@ static struct mb_cache *ext4_xattr_cache;
static struct xattr_handler *ext4_xattr_handler_map[] = {
[EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
#endif
[EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
[EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler,
#endif
};
@@ -112,11 +112,11 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
struct xattr_handler *ext4_xattr_handlers[] = {
&ext4_xattr_user_handler,
&ext4_xattr_trusted_handler,
-#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
&ext4_xattr_acl_access_handler,
&ext4_xattr_acl_default_handler,
#endif
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
&ext4_xattr_security_handler,
#endif
NULL
@@ -810,7 +810,7 @@ inserted:
/* We need to allocate a new block */
ext4_fsblk_t goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
- ext4_fsblk_t block = ext4_new_block(handle, inode,
+ ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
goal, &error);
if (error)
goto cleanup;
@@ -959,6 +959,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
struct ext4_xattr_block_find bs = {
.s = { .not_found = -ENODATA, },
};
+ unsigned long no_expand;
int error;
if (!name)
@@ -966,6 +967,9 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
if (strlen(name) > 255)
return -ERANGE;
down_write(&EXT4_I(inode)->xattr_sem);
+ no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND;
+ EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND;
+
error = ext4_get_inode_loc(inode, &is.iloc);
if (error)
goto cleanup;
@@ -1042,6 +1046,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
cleanup:
brelse(is.iloc.bh);
brelse(bs.bh);
+ if (no_expand == 0)
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND;
up_write(&EXT4_I(inode)->xattr_sem);
return error;
}
@@ -1512,7 +1518,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
char *name = entry->e_name;
int n;
- for (n=0; n < entry->e_name_len; n++) {
+ for (n = 0; n < entry->e_name_len; n++) {
hash = (hash << NAME_HASH_SHIFT) ^
(hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
*name++;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 5992fe979bb..8ede88b18c2 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -51,8 +51,8 @@ struct ext4_xattr_entry {
(((name_len) + EXT4_XATTR_ROUND + \
sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND)
#define EXT4_XATTR_NEXT(entry) \
- ( (struct ext4_xattr_entry *)( \
- (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) )
+ ((struct ext4_xattr_entry *)( \
+ (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)))
#define EXT4_XATTR_SIZE(size) \
(((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND)
@@ -63,7 +63,7 @@ struct ext4_xattr_entry {
EXT4_I(inode)->i_extra_isize))
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
-# ifdef CONFIG_EXT4DEV_FS_XATTR
+# ifdef CONFIG_EXT4_FS_XATTR
extern struct xattr_handler ext4_xattr_user_handler;
extern struct xattr_handler ext4_xattr_trusted_handler;
@@ -88,7 +88,7 @@ extern void exit_ext4_xattr(void);
extern struct xattr_handler *ext4_xattr_handlers[];
-# else /* CONFIG_EXT4DEV_FS_XATTR */
+# else /* CONFIG_EXT4_FS_XATTR */
static inline int
ext4_xattr_get(struct inode *inode, int name_index, const char *name,
@@ -141,9 +141,9 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
#define ext4_xattr_handlers NULL
-# endif /* CONFIG_EXT4DEV_FS_XATTR */
+# endif /* CONFIG_EXT4_FS_XATTR */
-#ifdef CONFIG_EXT4DEV_FS_SECURITY
+#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
struct inode *dir);
#else
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index fff33382cad..ac1a52cf2a3 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,13 +13,11 @@
#include "ext4.h"
#include "xattr.h"
-#define XATTR_TRUSTED_PREFIX "trusted."
-
static size_t
ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 67be723fcc4..d91aa61b42a 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,13 +12,11 @@
#include "ext4.h"
#include "xattr.h"
-#define XATTR_USER_PREFIX "user."
-
static size_t
ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index fda25479af2..3222f51c41c 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -36,7 +36,7 @@ static inline int fat_max_cache(struct inode *inode)
static struct kmem_cache *fat_cache_cachep;
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct fat_cache *cache = (struct fat_cache *)foo;
@@ -61,7 +61,7 @@ void fat_cache_destroy(void)
static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
{
- return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL);
+ return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS);
}
static inline void fat_cache_free(struct fat_cache *cache)
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 486725ee99a..cd4a0162e10 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -17,7 +17,6 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/msdos_fs.h>
-#include <linux/dirent.h>
#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/compat.h>
@@ -124,10 +123,11 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos,
* but ignore that right now.
* Ahem... Stack smashing in ring 0 isn't fun. Fixed.
*/
-static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int len,
+static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
int uni_xlate, struct nls_table *nls)
{
- wchar_t *ip, ec;
+ const wchar_t *ip;
+ wchar_t ec;
unsigned char *op, nc;
int charlen;
int k;
@@ -167,6 +167,16 @@ static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int len,
return (op - ascii);
}
+static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni,
+ unsigned char *buf, int size)
+{
+ if (sbi->options.utf8)
+ return utf8_wcstombs(buf, uni, size);
+ else
+ return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate,
+ sbi->nls_io);
+}
+
static inline int
fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni)
{
@@ -227,6 +237,19 @@ fat_shortname2uni(struct nls_table *nls, unsigned char *buf, int buf_size,
return len;
}
+static inline int fat_name_match(struct msdos_sb_info *sbi,
+ const unsigned char *a, int a_len,
+ const unsigned char *b, int b_len)
+{
+ if (a_len != b_len)
+ return 0;
+
+ if (sbi->options.name_check != 's')
+ return !nls_strnicmp(sbi->nls_io, a, b, a_len);
+ else
+ return !memcmp(a, b, a_len);
+}
+
enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, };
/**
@@ -302,6 +325,19 @@ parse_long:
}
/*
+ * Maximum buffer size of short name.
+ * [(MSDOS_NAME + '.') * max one char + nul]
+ * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
+ */
+#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
+/*
+ * Maximum buffer size of unicode chars from slots.
+ * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
+ */
+#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1)
+#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
+
+/*
* Return values: negative -> error, 0 -> not found, positive -> found,
* value is the total amount of slots, including the shortname entry.
*/
@@ -312,29 +348,20 @@ int fat_search_long(struct inode *inode, const unsigned char *name,
struct msdos_sb_info *sbi = MSDOS_SB(sb);
struct buffer_head *bh = NULL;
struct msdos_dir_entry *de;
- struct nls_table *nls_io = sbi->nls_io;
struct nls_table *nls_disk = sbi->nls_disk;
- wchar_t bufuname[14];
unsigned char nr_slots;
- int xlate_len;
+ wchar_t bufuname[14];
wchar_t *unicode = NULL;
unsigned char work[MSDOS_NAME];
- unsigned char *bufname = NULL;
- int uni_xlate = sbi->options.unicode_xlate;
- int utf8 = sbi->options.utf8;
- int anycase = (sbi->options.name_check != 's');
+ unsigned char bufname[FAT_MAX_SHORT_SIZE];
unsigned short opt_shortname = sbi->options.shortname;
loff_t cpos = 0;
- int chl, i, j, last_u, err;
-
- bufname = __getname();
- if (!bufname)
- return -ENOMEM;
+ int chl, i, j, last_u, err, len;
err = -ENOENT;
- while(1) {
+ while (1) {
if (fat_get_entry(inode, &cpos, &bh, &de) == -1)
- goto EODir;
+ goto end_of_dir;
parse_record:
nr_slots = 0;
if (de->name[0] == DELETED_FLAG)
@@ -353,7 +380,7 @@ parse_record:
else if (status == PARSE_NOT_LONGNAME)
goto parse_record;
else if (status == PARSE_EOF)
- goto EODir;
+ goto end_of_dir;
}
memcpy(work, de->name, sizeof(de->name));
@@ -394,30 +421,24 @@ parse_record:
if (!last_u)
continue;
+ /* Compare shortname */
bufuname[last_u] = 0x0000;
- xlate_len = utf8
- ?utf8_wcstombs(bufname, bufuname, PATH_MAX)
- :uni16_to_x8(bufname, bufuname, PATH_MAX, uni_xlate, nls_io);
- if (xlate_len == name_len)
- if ((!anycase && !memcmp(name, bufname, xlate_len)) ||
- (anycase && !nls_strnicmp(nls_io, name, bufname,
- xlate_len)))
- goto Found;
+ len = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname));
+ if (fat_name_match(sbi, name, name_len, bufname, len))
+ goto found;
if (nr_slots) {
- xlate_len = utf8
- ?utf8_wcstombs(bufname, unicode, PATH_MAX)
- :uni16_to_x8(bufname, unicode, PATH_MAX, uni_xlate, nls_io);
- if (xlate_len != name_len)
- continue;
- if ((!anycase && !memcmp(name, bufname, xlate_len)) ||
- (anycase && !nls_strnicmp(nls_io, name, bufname,
- xlate_len)))
- goto Found;
+ void *longname = unicode + FAT_MAX_UNI_CHARS;
+ int size = PATH_MAX - FAT_MAX_UNI_SIZE;
+
+ /* Compare longname */
+ len = fat_uni_to_x8(sbi, unicode, longname, size);
+ if (fat_name_match(sbi, name, name_len, longname, len))
+ goto found;
}
}
-Found:
+found:
nr_slots++; /* include the de */
sinfo->slot_off = cpos - nr_slots * sizeof(*de);
sinfo->nr_slots = nr_slots;
@@ -425,9 +446,7 @@ Found:
sinfo->bh = bh;
sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de);
err = 0;
-EODir:
- if (bufname)
- __putname(bufname);
+end_of_dir:
if (unicode)
__putname(unicode);
@@ -453,26 +472,23 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
struct msdos_sb_info *sbi = MSDOS_SB(sb);
struct buffer_head *bh;
struct msdos_dir_entry *de;
- struct nls_table *nls_io = sbi->nls_io;
struct nls_table *nls_disk = sbi->nls_disk;
- unsigned char long_slots;
- const char *fill_name;
- int fill_len;
+ unsigned char nr_slots;
wchar_t bufuname[14];
wchar_t *unicode = NULL;
- unsigned char c, work[MSDOS_NAME], bufname[56], *ptname = bufname;
- unsigned long lpos, dummy, *furrfu = &lpos;
- int uni_xlate = sbi->options.unicode_xlate;
+ unsigned char c, work[MSDOS_NAME];
+ unsigned char bufname[FAT_MAX_SHORT_SIZE], *ptname = bufname;
+ unsigned short opt_shortname = sbi->options.shortname;
int isvfat = sbi->options.isvfat;
- int utf8 = sbi->options.utf8;
int nocase = sbi->options.nocase;
- unsigned short opt_shortname = sbi->options.shortname;
+ const char *fill_name = NULL;
unsigned long inum;
- int chi, chl, i, i2, j, last, last_u, dotoffset = 0;
+ unsigned long lpos, dummy, *furrfu = &lpos;
loff_t cpos;
+ int chi, chl, i, i2, j, last, last_u, dotoffset = 0, fill_len = 0;
int ret = 0;
- lock_kernel();
+ lock_super(sb);
cpos = filp->f_pos;
/* Fake . and .. for the root directory. */
@@ -489,43 +505,58 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
cpos = 0;
}
}
- if (cpos & (sizeof(struct msdos_dir_entry)-1)) {
+ if (cpos & (sizeof(struct msdos_dir_entry) - 1)) {
ret = -ENOENT;
goto out;
}
bh = NULL;
-GetNew:
+get_new:
if (fat_get_entry(inode, &cpos, &bh, &de) == -1)
- goto EODir;
+ goto end_of_dir;
parse_record:
- long_slots = 0;
- /* Check for long filename entry */
- if (isvfat) {
+ nr_slots = 0;
+ /*
+ * Check for long filename entry, but if short_only, we don't
+ * need to parse long filename.
+ */
+ if (isvfat && !short_only) {
if (de->name[0] == DELETED_FLAG)
- goto RecEnd;
+ goto record_end;
if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME))
- goto RecEnd;
+ goto record_end;
if (de->attr != ATTR_EXT && IS_FREE(de->name))
- goto RecEnd;
+ goto record_end;
} else {
if ((de->attr & ATTR_VOLUME) || IS_FREE(de->name))
- goto RecEnd;
+ goto record_end;
}
if (isvfat && de->attr == ATTR_EXT) {
int status = fat_parse_long(inode, &cpos, &bh, &de,
- &unicode, &long_slots);
+ &unicode, &nr_slots);
if (status < 0) {
filp->f_pos = cpos;
ret = status;
goto out;
} else if (status == PARSE_INVALID)
- goto RecEnd;
+ goto record_end;
else if (status == PARSE_NOT_LONGNAME)
goto parse_record;
else if (status == PARSE_EOF)
- goto EODir;
+ goto end_of_dir;
+
+ if (nr_slots) {
+ void *longname = unicode + FAT_MAX_UNI_CHARS;
+ int size = PATH_MAX - FAT_MAX_UNI_SIZE;
+ int len = fat_uni_to_x8(sbi, unicode, longname, size);
+
+ fill_name = longname;
+ fill_len = len;
+ /* !both && !short_only, so we don't need shortname. */
+ if (!both)
+ goto start_filldir;
+ }
}
if (sbi->options.dotsOK) {
@@ -587,12 +618,32 @@ parse_record:
}
}
if (!last)
- goto RecEnd;
+ goto record_end;
i = last + dotoffset;
j = last_u;
- lpos = cpos - (long_slots+1)*sizeof(struct msdos_dir_entry);
+ if (isvfat) {
+ bufuname[j] = 0x0000;
+ i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname));
+ }
+ if (nr_slots) {
+ /* hack for fat_ioctl_filldir() */
+ struct fat_ioctl_filldir_callback *p = dirent;
+
+ p->longname = fill_name;
+ p->long_len = fill_len;
+ p->shortname = bufname;
+ p->short_len = i;
+ fill_name = NULL;
+ fill_len = 0;
+ } else {
+ fill_name = bufname;
+ fill_len = i;
+ }
+
+start_filldir:
+ lpos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry);
if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME))
inum = inode->i_ino;
else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) {
@@ -607,54 +658,22 @@ parse_record:
inum = iunique(sb, MSDOS_ROOT_INO);
}
- if (isvfat) {
- bufuname[j] = 0x0000;
- i = utf8 ? utf8_wcstombs(bufname, bufuname, sizeof(bufname))
- : uni16_to_x8(bufname, bufuname, sizeof(bufname), uni_xlate, nls_io);
- }
-
- fill_name = bufname;
- fill_len = i;
- if (!short_only && long_slots) {
- /* convert the unicode long name. 261 is maximum size
- * of unicode buffer. (13 * slots + nul) */
- void *longname = unicode + 261;
- int buf_size = PATH_MAX - (261 * sizeof(unicode[0]));
- int long_len = utf8
- ? utf8_wcstombs(longname, unicode, buf_size)
- : uni16_to_x8(longname, unicode, buf_size, uni_xlate, nls_io);
-
- if (!both) {
- fill_name = longname;
- fill_len = long_len;
- } else {
- /* hack for fat_ioctl_filldir() */
- struct fat_ioctl_filldir_callback *p = dirent;
-
- p->longname = longname;
- p->long_len = long_len;
- p->shortname = bufname;
- p->short_len = i;
- fill_name = NULL;
- fill_len = 0;
- }
- }
if (filldir(dirent, fill_name, fill_len, *furrfu, inum,
(de->attr & ATTR_DIR) ? DT_DIR : DT_REG) < 0)
- goto FillFailed;
+ goto fill_failed;
-RecEnd:
+record_end:
furrfu = &lpos;
filp->f_pos = cpos;
- goto GetNew;
-EODir:
+ goto get_new;
+end_of_dir:
filp->f_pos = cpos;
-FillFailed:
+fill_failed:
brelse(bh);
if (unicode)
__putname(unicode);
out:
- unlock_kernel();
+ unlock_super(sb);
return ret;
}
@@ -715,7 +734,7 @@ efault: \
return -EFAULT; \
}
-FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, dirent)
+FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent)
static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
void __user *dirent, filldir_t filldir,
@@ -741,7 +760,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
static int fat_dir_ioctl(struct inode *inode, struct file *filp,
unsigned int cmd, unsigned long arg)
{
- struct dirent __user *d1 = (struct dirent __user *)arg;
+ struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
int short_only, both;
switch (cmd) {
@@ -757,7 +776,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp,
return fat_generic_ioctl(inode, filp, cmd, arg);
}
- if (!access_ok(VERIFY_WRITE, d1, sizeof(struct dirent[2])))
+ if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
return -EFAULT;
/*
* Yes, we don't need this put_user() absolutely. However old
@@ -1082,7 +1101,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
goto error_free;
}
- fat_date_unix2dos(ts->tv_sec, &time, &date);
+ fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
de = (struct msdos_dir_entry *)bhs[0]->b_data;
/* filling the new directory slots ("." and ".." entries) */
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 302e95c4af7..fb98b3d847e 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -6,6 +6,7 @@
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/msdos_fs.h>
+#include <linux/blkdev.h>
struct fatent_operations {
void (*ent_blocknr)(struct super_block *, int, int *, sector_t *);
@@ -535,6 +536,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
struct fat_entry fatent;
struct buffer_head *bhs[MAX_BUF_PER_PAGE];
int i, err, nr_bhs;
+ int first_cl = cluster;
nr_bhs = 0;
fatent_init(&fatent);
@@ -551,6 +553,18 @@ int fat_free_clusters(struct inode *inode, int cluster)
goto error;
}
+ /*
+ * Issue discard for the sectors we no longer care about,
+ * batching contiguous clusters into one request
+ */
+ if (cluster != fatent.entry + 1) {
+ int nr_clus = fatent.entry - first_cl + 1;
+
+ sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl),
+ nr_clus * sbi->sec_per_clus);
+ first_cl = cluster;
+ }
+
ops->ent_put(&fatent, FAT_ENT_FREE);
if (sbi->free_clusters != -1) {
sbi->free_clusters++;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 771326b8047..ddde37025ca 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,11 +11,12 @@
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/msdos_fs.h>
-#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
+#include <linux/fsnotify.h>
+#include <linux/security.h>
int fat_generic_ioctl(struct inode *inode, struct file *filp,
unsigned int cmd, unsigned long arg)
@@ -65,6 +66,7 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
/* Equivalent to a chmod() */
ia.ia_valid = ATTR_MODE | ATTR_CTIME;
+ ia.ia_ctime = current_fs_time(inode->i_sb);
if (is_dir) {
ia.ia_mode = MSDOS_MKMODE(attr,
S_IRWXUGO & ~sbi->options.fs_dmask)
@@ -91,11 +93,21 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
}
}
+ /*
+ * The security check is questionable... We single
+ * out the RO attribute for checking by the security
+ * module, just because it maps to a file mode.
+ */
+ err = security_inode_setattr(filp->f_path.dentry, &ia);
+ if (err)
+ goto up;
+
/* This MUST be done before doing anything irreversible... */
- err = notify_change(filp->f_path.dentry, &ia);
+ err = fat_setattr(filp->f_path.dentry, &ia);
if (err)
goto up;
+ fsnotify_change(filp->f_path.dentry, ia.ia_valid);
if (sbi->options.sys_immutable) {
if (attr & ATTR_SYS)
inode->i_flags |= S_IMMUTABLE;
@@ -242,9 +254,7 @@ void fat_truncate(struct inode *inode)
nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits;
- lock_kernel();
fat_free(inode, nr_clusters);
- unlock_kernel();
fat_flush_inodes(inode->i_sb, inode, NULL);
}
@@ -303,6 +313,8 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
return 0;
}
+#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
+
int fat_setattr(struct dentry *dentry, struct iattr *attr)
{
struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
@@ -310,8 +322,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
int error = 0;
unsigned int ia_valid;
- lock_kernel();
-
/*
* Expand the file. Since inode_setattr() updates ->i_size
* before calling the ->truncate(), but FAT needs to fill the
@@ -328,9 +338,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
/* Check for setting the inode time. */
ia_valid = attr->ia_valid;
- if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) {
+ if (ia_valid & TIMES_SET_FLAGS) {
if (fat_allow_set_time(sbi, inode))
- attr->ia_valid &= ~(ATTR_MTIME_SET | ATTR_ATIME_SET);
+ attr->ia_valid &= ~TIMES_SET_FLAGS;
}
error = inode_change_ok(inode, attr);
@@ -366,7 +376,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
error = inode_setattr(inode, attr);
out:
- unlock_kernel();
return error;
}
EXPORT_SYMBOL_GPL(fat_setattr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4e0a3dd9d67..d12cdf2a040 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -382,17 +382,20 @@ static int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1))
& ~((loff_t)sbi->cluster_size - 1)) >> 9;
inode->i_mtime.tv_sec =
- date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date));
+ date_dos2unix(le16_to_cpu(de->time), le16_to_cpu(de->date),
+ sbi->options.tz_utc);
inode->i_mtime.tv_nsec = 0;
if (sbi->options.isvfat) {
int secs = de->ctime_cs / 100;
int csecs = de->ctime_cs % 100;
inode->i_ctime.tv_sec =
date_dos2unix(le16_to_cpu(de->ctime),
- le16_to_cpu(de->cdate)) + secs;
+ le16_to_cpu(de->cdate),
+ sbi->options.tz_utc) + secs;
inode->i_ctime.tv_nsec = csecs * 10000000;
inode->i_atime.tv_sec =
- date_dos2unix(0, le16_to_cpu(de->adate));
+ date_dos2unix(0, le16_to_cpu(de->adate),
+ sbi->options.tz_utc);
inode->i_atime.tv_nsec = 0;
} else
inode->i_ctime = inode->i_atime = inode->i_mtime;
@@ -440,14 +443,13 @@ static void fat_delete_inode(struct inode *inode)
static void fat_clear_inode(struct inode *inode)
{
- struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
- lock_kernel();
spin_lock(&sbi->inode_hash_lock);
fat_cache_inval_inode(inode);
hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
spin_unlock(&sbi->inode_hash_lock);
- unlock_kernel();
}
static void fat_write_super(struct super_block *sb)
@@ -485,7 +487,7 @@ static struct kmem_cache *fat_inode_cachep;
static struct inode *fat_alloc_inode(struct super_block *sb)
{
struct msdos_inode_info *ei;
- ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
return &ei->vfs_inode;
@@ -496,7 +498,7 @@ static void fat_destroy_inode(struct inode *inode)
kmem_cache_free(fat_inode_cachep, MSDOS_I(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct msdos_inode_info *ei = (struct msdos_inode_info *)foo;
@@ -560,26 +562,23 @@ static int fat_write_inode(struct inode *inode, int wait)
struct buffer_head *bh;
struct msdos_dir_entry *raw_entry;
loff_t i_pos;
- int err = 0;
+ int err;
retry:
i_pos = MSDOS_I(inode)->i_pos;
if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
return 0;
- lock_kernel();
bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
if (!bh) {
printk(KERN_ERR "FAT: unable to read inode block "
"for updating (i_pos %lld)\n", i_pos);
- err = -EIO;
- goto out;
+ return -EIO;
}
spin_lock(&sbi->inode_hash_lock);
if (i_pos != MSDOS_I(inode)->i_pos) {
spin_unlock(&sbi->inode_hash_lock);
brelse(bh);
- unlock_kernel();
goto retry;
}
@@ -592,21 +591,23 @@ retry:
raw_entry->attr = fat_attr(inode);
raw_entry->start = cpu_to_le16(MSDOS_I(inode)->i_logstart);
raw_entry->starthi = cpu_to_le16(MSDOS_I(inode)->i_logstart >> 16);
- fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time, &raw_entry->date);
+ fat_date_unix2dos(inode->i_mtime.tv_sec, &raw_entry->time,
+ &raw_entry->date, sbi->options.tz_utc);
if (sbi->options.isvfat) {
__le16 atime;
- fat_date_unix2dos(inode->i_ctime.tv_sec,&raw_entry->ctime,&raw_entry->cdate);
- fat_date_unix2dos(inode->i_atime.tv_sec,&atime,&raw_entry->adate);
+ fat_date_unix2dos(inode->i_ctime.tv_sec, &raw_entry->ctime,
+ &raw_entry->cdate, sbi->options.tz_utc);
+ fat_date_unix2dos(inode->i_atime.tv_sec, &atime,
+ &raw_entry->adate, sbi->options.tz_utc);
raw_entry->ctime_cs = (inode->i_ctime.tv_sec & 1) * 100 +
inode->i_ctime.tv_nsec / 10000000;
}
spin_unlock(&sbi->inode_hash_lock);
mark_buffer_dirty(bh);
+ err = 0;
if (wait)
err = sync_dirty_buffer(bh);
brelse(bh);
-out:
- unlock_kernel();
return err;
}
@@ -736,6 +737,7 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
static struct dentry *fat_get_parent(struct dentry *child)
{
+ struct super_block *sb = child->d_sb;
struct buffer_head *bh;
struct msdos_dir_entry *de;
loff_t i_pos;
@@ -743,14 +745,14 @@ static struct dentry *fat_get_parent(struct dentry *child)
struct inode *inode;
int err;
- lock_kernel();
+ lock_super(sb);
err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos);
if (err) {
parent = ERR_PTR(err);
goto out;
}
- inode = fat_build_inode(child->d_sb, de, i_pos);
+ inode = fat_build_inode(sb, de, i_pos);
brelse(bh);
if (IS_ERR(inode)) {
parent = ERR_CAST(inode);
@@ -762,7 +764,7 @@ static struct dentry *fat_get_parent(struct dentry *child)
parent = ERR_PTR(-ENOMEM);
}
out:
- unlock_kernel();
+ unlock_super(sb);
return parent;
}
@@ -836,6 +838,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
}
if (sbi->options.flush)
seq_puts(m, ",flush");
+ if (opts->tz_utc)
+ seq_puts(m, ",tz=UTC");
return 0;
}
@@ -848,10 +852,10 @@ enum {
Opt_charset, Opt_shortname_lower, Opt_shortname_win95,
Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
- Opt_obsolate, Opt_flush, Opt_err,
+ Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err,
};
-static match_table_t fat_tokens = {
+static const match_table_t fat_tokens = {
{Opt_check_r, "check=relaxed"},
{Opt_check_s, "check=strict"},
{Opt_check_n, "check=normal"},
@@ -883,16 +887,17 @@ static match_table_t fat_tokens = {
{Opt_obsolate, "cvf_options=%100s"},
{Opt_obsolate, "posix"},
{Opt_flush, "flush"},
+ {Opt_tz_utc, "tz=UTC"},
{Opt_err, NULL},
};
-static match_table_t msdos_tokens = {
+static const match_table_t msdos_tokens = {
{Opt_nodots, "nodots"},
{Opt_nodots, "dotsOK=no"},
{Opt_dots, "dots"},
{Opt_dots, "dotsOK=yes"},
{Opt_err, NULL}
};
-static match_table_t vfat_tokens = {
+static const match_table_t vfat_tokens = {
{Opt_charset, "iocharset=%s"},
{Opt_shortname_lower, "shortname=lower"},
{Opt_shortname_win95, "shortname=win95"},
@@ -947,10 +952,11 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
opts->utf8 = opts->unicode_xlate = 0;
opts->numtail = 1;
opts->usefree = opts->nocase = 0;
+ opts->tz_utc = 0;
*debug = 0;
if (!options)
- return 0;
+ goto out;
while ((p = strsep(&options, ",")) != NULL) {
int token;
@@ -1036,6 +1042,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
case Opt_flush:
opts->flush = 1;
break;
+ case Opt_tz_utc:
+ opts->tz_utc = 1;
+ break;
/* msdos specific */
case Opt_dots:
@@ -1104,10 +1113,13 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
return -EINVAL;
}
}
+
+out:
/* UTF-8 doesn't provide FAT semantics */
if (!strcmp(opts->iocharset, "utf8")) {
printk(KERN_ERR "FAT: utf8 is not a recommended IO charset"
- " for FAT filesystems, filesystem will be case sensitive!\n");
+ " for FAT filesystems, filesystem will be "
+ "case sensitive!\n");
}
/* If user doesn't specify allow_utime, it's initialized from dmask. */
@@ -1172,6 +1184,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
long error;
char buf[50];
+ /*
+ * GFP_KERNEL is ok here, because while we do hold the
+ * supeblock lock, memory pressure can't call back into
+ * the filesystem, since we're only just about to mount
+ * it and have no inodes etc active!
+ */
sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 61f23511eac..79fb98ad36d 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -142,7 +142,7 @@ static int day_n[] = {
};
/* Convert a MS-DOS time/date pair to a UNIX date (seconds since 1 1 70). */
-int date_dos2unix(unsigned short time, unsigned short date)
+int date_dos2unix(unsigned short time, unsigned short date, int tz_utc)
{
int month, year, secs;
@@ -156,16 +156,18 @@ int date_dos2unix(unsigned short time, unsigned short date)
((date & 31)-1+day_n[month]+(year/4)+year*365-((year & 3) == 0 &&
month < 2 ? 1 : 0)+3653);
/* days since 1.1.70 plus 80's leap day */
- secs += sys_tz.tz_minuteswest*60;
+ if (!tz_utc)
+ secs += sys_tz.tz_minuteswest*60;
return secs;
}
/* Convert linear UNIX date to a MS-DOS time/date pair. */
-void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date)
+void fat_date_unix2dos(int unix_date, __le16 *time, __le16 *date, int tz_utc)
{
int day, year, nl_day, month;
- unix_date -= sys_tz.tz_minuteswest*60;
+ if (!tz_utc)
+ unix_date -= sys_tz.tz_minuteswest*60;
/* Jan 1 GMT 00:00:00 1980. But what about another time zone? */
if (unix_date < 315532800)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bfd776509a7..ac4f7db9f13 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -12,7 +12,6 @@
#include <linux/fdtable.h>
#include <linux/capability.h>
#include <linux/dnotify.h>
-#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/security.h>
@@ -50,145 +49,94 @@ static int get_close_on_exec(unsigned int fd)
return res;
}
-/*
- * locate_fd finds a free file descriptor in the open_fds fdset,
- * expanding the fd arrays if necessary. Must be called with the
- * file_lock held for write.
- */
-
-static int locate_fd(unsigned int orig_start, int cloexec)
-{
- struct files_struct *files = current->files;
- unsigned int newfd;
- unsigned int start;
- int error;
- struct fdtable *fdt;
-
- spin_lock(&files->file_lock);
-
- error = -EINVAL;
- if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
- goto out;
-
-repeat:
- fdt = files_fdtable(files);
- /*
- * Someone might have closed fd's in the range
- * orig_start..fdt->next_fd
- */
- start = orig_start;
- if (start < files->next_fd)
- start = files->next_fd;
-
- newfd = start;
- if (start < fdt->max_fds)
- newfd = find_next_zero_bit(fdt->open_fds->fds_bits,
- fdt->max_fds, start);
-
- error = -EMFILE;
- if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
- goto out;
-
- error = expand_files(files, newfd);
- if (error < 0)
- goto out;
-
- /*
- * If we needed to expand the fs array we
- * might have blocked - try again.
- */
- if (error)
- goto repeat;
-
- if (start <= files->next_fd)
- files->next_fd = newfd + 1;
-
- FD_SET(newfd, fdt->open_fds);
- if (cloexec)
- FD_SET(newfd, fdt->close_on_exec);
- else
- FD_CLR(newfd, fdt->close_on_exec);
- error = newfd;
-
-out:
- spin_unlock(&files->file_lock);
- return error;
-}
-
-static int dupfd(struct file *file, unsigned int start, int cloexec)
-{
- int fd = locate_fd(start, cloexec);
- if (fd >= 0)
- fd_install(fd, file);
- else
- fput(file);
-
- return fd;
-}
-
-asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
+asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
int err = -EBADF;
struct file * file, *tofree;
struct files_struct * files = current->files;
struct fdtable *fdt;
- spin_lock(&files->file_lock);
- if (!(file = fcheck(oldfd)))
- goto out_unlock;
- err = newfd;
- if (newfd == oldfd)
- goto out_unlock;
- err = -EBADF;
- if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
- goto out_unlock;
- get_file(file); /* We are now finished with oldfd */
-
- err = expand_files(files, newfd);
- if (err < 0)
- goto out_fput;
+ if ((flags & ~O_CLOEXEC) != 0)
+ return -EINVAL;
- /* To avoid races with open() and dup(), we will mark the fd as
- * in-use in the open-file bitmap throughout the entire dup2()
- * process. This is quite safe: do_close() uses the fd array
- * entry, not the bitmap, to decide what work needs to be
- * done. --sct */
- /* Doesn't work. open() might be there first. --AV */
+ if (unlikely(oldfd == newfd))
+ return -EINVAL;
- /* Yes. It's a race. In user space. Nothing sane to do */
+ spin_lock(&files->file_lock);
+ err = expand_files(files, newfd);
+ file = fcheck(oldfd);
+ if (unlikely(!file))
+ goto Ebadf;
+ if (unlikely(err < 0)) {
+ if (err == -EMFILE)
+ goto Ebadf;
+ goto out_unlock;
+ }
+ /*
+ * We need to detect attempts to do dup2() over allocated but still
+ * not finished descriptor. NB: OpenBSD avoids that at the price of
+ * extra work in their equivalent of fget() - they insert struct
+ * file immediately after grabbing descriptor, mark it larval if
+ * more work (e.g. actual opening) is needed and make sure that
+ * fget() treats larval files as absent. Potentially interesting,
+ * but while extra work in fget() is trivial, locking implications
+ * and amount of surgery on open()-related paths in VFS are not.
+ * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
+ * deadlocks in rather amusing ways, AFAICS. All of that is out of
+ * scope of POSIX or SUS, since neither considers shared descriptor
+ * tables and this condition does not arise without those.
+ */
err = -EBUSY;
fdt = files_fdtable(files);
tofree = fdt->fd[newfd];
if (!tofree && FD_ISSET(newfd, fdt->open_fds))
- goto out_fput;
-
+ goto out_unlock;
+ get_file(file);
rcu_assign_pointer(fdt->fd[newfd], file);
FD_SET(newfd, fdt->open_fds);
- FD_CLR(newfd, fdt->close_on_exec);
+ if (flags & O_CLOEXEC)
+ FD_SET(newfd, fdt->close_on_exec);
+ else
+ FD_CLR(newfd, fdt->close_on_exec);
spin_unlock(&files->file_lock);
if (tofree)
filp_close(tofree, files);
- err = newfd;
-out:
- return err;
+
+ return newfd;
+
+Ebadf:
+ err = -EBADF;
out_unlock:
spin_unlock(&files->file_lock);
- goto out;
+ return err;
+}
-out_fput:
- spin_unlock(&files->file_lock);
- fput(file);
- goto out;
+asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd)
+{
+ if (unlikely(newfd == oldfd)) { /* corner case */
+ struct files_struct *files = current->files;
+ rcu_read_lock();
+ if (!fcheck_files(files, oldfd))
+ oldfd = -EBADF;
+ rcu_read_unlock();
+ return oldfd;
+ }
+ return sys_dup3(oldfd, newfd, 0);
}
asmlinkage long sys_dup(unsigned int fildes)
{
int ret = -EBADF;
- struct file * file = fget(fildes);
-
- if (file)
- ret = dupfd(file, 0, 0);
+ struct file *file = fget(fildes);
+
+ if (file) {
+ ret = get_unused_fd();
+ if (ret >= 0)
+ fd_install(ret, file);
+ else
+ fput(file);
+ }
return ret;
}
@@ -227,7 +175,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
if (error)
return error;
- lock_kernel();
if ((arg ^ filp->f_flags) & FASYNC) {
if (filp->f_op && filp->f_op->fasync) {
error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
@@ -238,7 +185,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
out:
- unlock_kernel();
return error;
}
@@ -313,8 +259,13 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
switch (cmd) {
case F_DUPFD:
case F_DUPFD_CLOEXEC:
- get_file(filp);
- err = dupfd(filp, arg, cmd == F_DUPFD_CLOEXEC);
+ if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+ break;
+ err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0);
+ if (err >= 0) {
+ get_file(filp);
+ fd_install(err, filp);
+ }
break;
case F_GETFD:
err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
diff --git a/fs/fifo.c b/fs/fifo.c
index 9785e36f81e..987bf941149 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -57,7 +57,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
* POSIX.1 says that O_NONBLOCK means return with the FIFO
* opened, even when there is no process writing the FIFO.
*/
- filp->f_op = &read_fifo_fops;
+ filp->f_op = &read_pipefifo_fops;
pipe->r_counter++;
if (pipe->readers++ == 0)
wake_up_partner(inode);
@@ -86,7 +86,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
if ((filp->f_flags & O_NONBLOCK) && !pipe->readers)
goto err;
- filp->f_op = &write_fifo_fops;
+ filp->f_op = &write_pipefifo_fops;
pipe->w_counter++;
if (!pipe->writers++)
wake_up_partner(inode);
@@ -105,7 +105,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
* This implementation will NEVER block on a O_RDWR open, since
* the process can at least talk to itself.
*/
- filp->f_op = &rdwr_fifo_fops;
+ filp->f_op = &rdwr_pipefifo_fops;
pipe->readers++;
pipe->writers++;
@@ -151,5 +151,5 @@ err_nocleanup:
* depending on the access mode of the file...
*/
const struct file_operations def_fifo_fops = {
- .open = fifo_open, /* will set read or write pipe_fops */
+ .open = fifo_open, /* will set read_ or write_pipefifo_fops */
};
diff --git a/fs/file.c b/fs/file.c
index 7b3887e054d..f313314f996 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -6,6 +6,7 @@
* Manage the dynamic fd arrays in the process files_struct.
*/
+#include <linux/module.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/time.h>
@@ -250,9 +251,18 @@ int expand_files(struct files_struct *files, int nr)
struct fdtable *fdt;
fdt = files_fdtable(files);
+
+ /*
+ * N.B. For clone tasks sharing a files structure, this test
+ * will limit the total number of files that can be opened.
+ */
+ if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
+ return -EMFILE;
+
/* Do we need to expand? */
if (nr < fdt->max_fds)
return 0;
+
/* Can we expand? */
if (nr >= sysctl_nr_open)
return -EMFILE;
@@ -423,3 +433,63 @@ struct files_struct init_files = {
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
};
+
+/*
+ * allocate a file descriptor, mark it busy.
+ */
+int alloc_fd(unsigned start, unsigned flags)
+{
+ struct files_struct *files = current->files;
+ unsigned int fd;
+ int error;
+ struct fdtable *fdt;
+
+ spin_lock(&files->file_lock);
+repeat:
+ fdt = files_fdtable(files);
+ fd = start;
+ if (fd < files->next_fd)
+ fd = files->next_fd;
+
+ if (fd < fdt->max_fds)
+ fd = find_next_zero_bit(fdt->open_fds->fds_bits,
+ fdt->max_fds, fd);
+
+ error = expand_files(files, fd);
+ if (error < 0)
+ goto out;
+
+ /*
+ * If we needed to expand the fs array we
+ * might have blocked - try again.
+ */
+ if (error)
+ goto repeat;
+
+ if (start <= files->next_fd)
+ files->next_fd = fd + 1;
+
+ FD_SET(fd, fdt->open_fds);
+ if (flags & O_CLOEXEC)
+ FD_SET(fd, fdt->close_on_exec);
+ else
+ FD_CLR(fd, fdt->close_on_exec);
+ error = fd;
+#if 1
+ /* Sanity check */
+ if (rcu_dereference(fdt->fd[fd]) != NULL) {
+ printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
+ rcu_assign_pointer(fdt->fd[fd], NULL);
+ }
+#endif
+
+out:
+ spin_unlock(&files->file_lock);
+ return error;
+}
+
+int get_unused_fd(void)
+{
+ return alloc_fd(0, 0);
+}
+EXPORT_SYMBOL(get_unused_fd);
diff --git a/fs/file_table.c b/fs/file_table.c
index 83084225b4c..f45a4493f9e 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -120,7 +120,7 @@ struct file *get_empty_filp(void)
tsk = current;
INIT_LIST_HEAD(&f->f_u.fu_list);
- atomic_set(&f->f_count, 1);
+ atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
f->f_uid = tsk->fsuid;
f->f_gid = tsk->fsgid;
@@ -219,7 +219,7 @@ EXPORT_SYMBOL(init_file);
void fput(struct file *file)
{
- if (atomic_dec_and_test(&file->f_count))
+ if (atomic_long_dec_and_test(&file->f_count))
__fput(file);
}
@@ -294,7 +294,7 @@ struct file *fget(unsigned int fd)
rcu_read_lock();
file = fcheck_files(files, fd);
if (file) {
- if (!atomic_inc_not_zero(&file->f_count)) {
+ if (!atomic_long_inc_not_zero(&file->f_count)) {
/* File object ref couldn't be taken */
rcu_read_unlock();
return NULL;
@@ -326,7 +326,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
rcu_read_lock();
file = fcheck_files(files, fd);
if (file) {
- if (atomic_inc_not_zero(&file->f_count))
+ if (atomic_long_inc_not_zero(&file->f_count))
*fput_needed = 1;
else
/* Didn't get the reference, someone's freed */
@@ -341,7 +341,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
void put_filp(struct file *file)
{
- if (atomic_dec_and_test(&file->f_count)) {
+ if (atomic_long_dec_and_test(&file->f_count)) {
security_file_free(file);
file_kill(file);
file_free(file);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ae45f77765c..25adfc3c693 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -424,8 +424,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so
* that it can be located for waiting on in __writeback_single_inode().
*
- * Called under inode_lock.
- *
* If `bdi' is non-zero then we're being asked to writeback a specific queue.
* This function assumes that the blockdev superblock's inodes are backed by
* a variety of queues, so all inodes are searched. For other superblocks,
@@ -441,11 +439,12 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* on the writer throttling path, and we get decent balancing between many
* throttled threads: we don't want them all piling up on inode_sync_wait.
*/
-static void
-sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
+void generic_sync_sb_inodes(struct super_block *sb,
+ struct writeback_control *wbc)
{
const unsigned long start = jiffies; /* livelock avoidance */
+ spin_lock(&inode_lock);
if (!wbc->for_kupdate || list_empty(&sb->s_io))
queue_io(sb, wbc->older_than_this);
@@ -524,8 +523,16 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
if (!list_empty(&sb->s_more_io))
wbc->more_io = 1;
}
+ spin_unlock(&inode_lock);
return; /* Leave any unwritten inodes on s_io */
}
+EXPORT_SYMBOL_GPL(generic_sync_sb_inodes);
+
+static void sync_sb_inodes(struct super_block *sb,
+ struct writeback_control *wbc)
+{
+ generic_sync_sb_inodes(sb, wbc);
+}
/*
* Start writeback of dirty pagecache data against all unlocked inodes.
@@ -565,11 +572,8 @@ restart:
* be unmounted by the time it is released.
*/
if (down_read_trylock(&sb->s_umount)) {
- if (sb->s_root) {
- spin_lock(&inode_lock);
+ if (sb->s_root)
sync_sb_inodes(sb, wbc);
- spin_unlock(&inode_lock);
- }
up_read(&sb->s_umount);
}
spin_lock(&sb_lock);
@@ -607,9 +611,7 @@ void sync_inodes_sb(struct super_block *sb, int wait)
(inodes_stat.nr_inodes - inodes_stat.nr_unused) +
nr_dirty + nr_unstable;
wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */
- spin_lock(&inode_lock);
sync_sb_inodes(sb, &wbc);
- spin_unlock(&inode_lock);
}
/*
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 2060bf06b90..fd03330cade 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -97,7 +97,7 @@ void fuse_invalidate_attr(struct inode *inode)
* timeout is unknown (unlink, rmdir, rename and in some cases
* lookup)
*/
-static void fuse_invalidate_entry_cache(struct dentry *entry)
+void fuse_invalidate_entry_cache(struct dentry *entry)
{
fuse_dentry_settime(entry, 0);
}
@@ -112,18 +112,16 @@ static void fuse_invalidate_entry(struct dentry *entry)
fuse_invalidate_entry_cache(entry);
}
-static void fuse_lookup_init(struct fuse_req *req, struct inode *dir,
- struct dentry *entry,
+static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_req *req,
+ u64 nodeid, struct qstr *name,
struct fuse_entry_out *outarg)
{
- struct fuse_conn *fc = get_fuse_conn(dir);
-
memset(outarg, 0, sizeof(struct fuse_entry_out));
req->in.h.opcode = FUSE_LOOKUP;
- req->in.h.nodeid = get_node_id(dir);
+ req->in.h.nodeid = nodeid;
req->in.numargs = 1;
- req->in.args[0].size = entry->d_name.len + 1;
- req->in.args[0].value = entry->d_name.name;
+ req->in.args[0].size = name->len + 1;
+ req->in.args[0].value = name->name;
req->out.numargs = 1;
if (fc->minor < 9)
req->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
@@ -189,7 +187,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
attr_version = fuse_get_attr_version(fc);
parent = dget_parent(entry);
- fuse_lookup_init(req, parent->d_inode, entry, &outarg);
+ fuse_lookup_init(fc, req, get_node_id(parent->d_inode),
+ &entry->d_name, &outarg);
request_send(fc, req);
dput(parent);
err = req->out.h.error;
@@ -225,7 +224,7 @@ static int invalid_nodeid(u64 nodeid)
return !nodeid || nodeid == FUSE_ROOT_ID;
}
-static struct dentry_operations fuse_dentry_operations = {
+struct dentry_operations fuse_dentry_operations = {
.d_revalidate = fuse_dentry_revalidate,
};
@@ -239,85 +238,127 @@ int fuse_valid_type(int m)
* Add a directory inode to a dentry, ensuring that no other dentry
* refers to this inode. Called with fc->inst_mutex.
*/
-static int fuse_d_add_directory(struct dentry *entry, struct inode *inode)
+static struct dentry *fuse_d_add_directory(struct dentry *entry,
+ struct inode *inode)
{
struct dentry *alias = d_find_alias(inode);
- if (alias) {
+ if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
/* This tries to shrink the subtree below alias */
fuse_invalidate_entry(alias);
dput(alias);
if (!list_empty(&inode->i_dentry))
- return -EBUSY;
+ return ERR_PTR(-EBUSY);
+ } else {
+ dput(alias);
}
- d_add(entry, inode);
- return 0;
+ return d_splice_alias(inode, entry);
}
-static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
- struct nameidata *nd)
+int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
+ struct fuse_entry_out *outarg, struct inode **inode)
{
- int err;
- struct fuse_entry_out outarg;
- struct inode *inode = NULL;
- struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
struct fuse_req *req;
struct fuse_req *forget_req;
u64 attr_version;
+ int err;
- if (entry->d_name.len > FUSE_NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
+ *inode = NULL;
+ err = -ENAMETOOLONG;
+ if (name->len > FUSE_NAME_MAX)
+ goto out;
req = fuse_get_req(fc);
+ err = PTR_ERR(req);
if (IS_ERR(req))
- return ERR_CAST(req);
+ goto out;
forget_req = fuse_get_req(fc);
+ err = PTR_ERR(forget_req);
if (IS_ERR(forget_req)) {
fuse_put_request(fc, req);
- return ERR_CAST(forget_req);
+ goto out;
}
attr_version = fuse_get_attr_version(fc);
- fuse_lookup_init(req, dir, entry, &outarg);
+ fuse_lookup_init(fc, req, nodeid, name, outarg);
request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
/* Zero nodeid is same as -ENOENT, but with valid timeout */
- if (!err && outarg.nodeid &&
- (invalid_nodeid(outarg.nodeid) ||
- !fuse_valid_type(outarg.attr.mode)))
- err = -EIO;
- if (!err && outarg.nodeid) {
- inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
- &outarg.attr, entry_attr_timeout(&outarg),
- attr_version);
- if (!inode) {
- fuse_send_forget(fc, forget_req, outarg.nodeid, 1);
- return ERR_PTR(-ENOMEM);
- }
+ if (err || !outarg->nodeid)
+ goto out_put_forget;
+
+ err = -EIO;
+ if (!outarg->nodeid)
+ goto out_put_forget;
+ if (!fuse_valid_type(outarg->attr.mode))
+ goto out_put_forget;
+
+ *inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
+ &outarg->attr, entry_attr_timeout(outarg),
+ attr_version);
+ err = -ENOMEM;
+ if (!*inode) {
+ fuse_send_forget(fc, forget_req, outarg->nodeid, 1);
+ goto out;
}
+ err = 0;
+
+ out_put_forget:
fuse_put_request(fc, forget_req);
- if (err && err != -ENOENT)
- return ERR_PTR(err);
+ out:
+ return err;
+}
+
+static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
+ struct nameidata *nd)
+{
+ int err;
+ struct fuse_entry_out outarg;
+ struct inode *inode;
+ struct dentry *newent;
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ bool outarg_valid = true;
+
+ err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
+ &outarg, &inode);
+ if (err == -ENOENT) {
+ outarg_valid = false;
+ err = 0;
+ }
+ if (err)
+ goto out_err;
+
+ err = -EIO;
+ if (inode && get_node_id(inode) == FUSE_ROOT_ID)
+ goto out_iput;
if (inode && S_ISDIR(inode->i_mode)) {
mutex_lock(&fc->inst_mutex);
- err = fuse_d_add_directory(entry, inode);
+ newent = fuse_d_add_directory(entry, inode);
mutex_unlock(&fc->inst_mutex);
- if (err) {
- iput(inode);
- return ERR_PTR(err);
- }
- } else
- d_add(entry, inode);
+ err = PTR_ERR(newent);
+ if (IS_ERR(newent))
+ goto out_iput;
+ } else {
+ newent = d_splice_alias(inode, entry);
+ }
+ entry = newent ? newent : entry;
entry->d_op = &fuse_dentry_operations;
- if (!err)
+ if (outarg_valid)
fuse_change_entry_timeout(entry, &outarg);
else
fuse_invalidate_entry_cache(entry);
- return NULL;
+
+ return newent;
+
+ out_iput:
+ iput(inode);
+ out_err:
+ return ERR_PTR(err);
}
/*
@@ -857,7 +898,7 @@ static int fuse_access(struct inode *inode, int mask)
return PTR_ERR(req);
memset(&inarg, 0, sizeof(inarg));
- inarg.mask = mask;
+ inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC);
req->in.h.opcode = FUSE_ACCESS;
req->in.h.nodeid = get_node_id(inode);
req->in.numargs = 1;
@@ -886,7 +927,7 @@ static int fuse_access(struct inode *inode, int mask)
* access request is sent. Execute permission is still checked
* locally based on file mode.
*/
-static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
+static int fuse_permission(struct inode *inode, int mask)
{
struct fuse_conn *fc = get_fuse_conn(inode);
bool refreshed = false;
@@ -921,7 +962,7 @@ static int fuse_permission(struct inode *inode, int mask, struct nameidata *nd)
exist. So if permissions are revoked this won't be
noticed immediately, only after the attribute
timeout has expired */
- } else if (nd && (nd->flags & (LOOKUP_ACCESS | LOOKUP_CHDIR))) {
+ } else if (mask & MAY_ACCESS) {
err = fuse_access(inode, mask);
} else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
if (!(inode->i_mode & S_IXUGO)) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 8092f0d9fd1..2bada6bbc31 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -893,7 +893,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (count == 0)
goto out;
- err = remove_suid(file->f_path.dentry);
+ err = file_remove_suid(file);
if (err)
goto out;
@@ -1341,6 +1341,11 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
int err;
+ if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
+ /* NLM needs asynchronous locks, which we don't support yet */
+ return -ENOLCK;
+ }
+
/* Unlock on close is handled by the flush method */
if (fl->fl_flags & FL_CLOSE)
return 0;
@@ -1365,7 +1370,9 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
- if (cmd == F_GETLK) {
+ if (cmd == F_CANCELLK) {
+ err = 0;
+ } else if (cmd == F_GETLK) {
if (fc->no_lock) {
posix_test_lock(file, fl);
err = 0;
@@ -1373,7 +1380,7 @@ static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
err = fuse_getlk(file, fl);
} else {
if (fc->no_lock)
- err = posix_lock_file_wait(file, fl);
+ err = posix_lock_file(file, fl, NULL);
else
err = fuse_setlk(file, fl, 0);
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bae948657c4..3a876076bdd 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -363,6 +363,9 @@ struct fuse_conn {
/** Do not send separate SETATTR request before open(O_TRUNC) */
unsigned atomic_o_trunc : 1;
+ /** Filesystem supports NFS exporting. Only set in INIT */
+ unsigned export_support : 1;
+
/*
* The following bitfields are only for optimization purposes
* and hence races in setting them will not cause malfunction
@@ -464,6 +467,8 @@ static inline u64 get_node_id(struct inode *inode)
/** Device operations */
extern const struct file_operations fuse_dev_operations;
+extern struct dentry_operations fuse_dentry_operations;
+
/**
* Get a filled in inode
*/
@@ -471,6 +476,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
int generation, struct fuse_attr *attr,
u64 attr_valid, u64 attr_version);
+int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
+ struct fuse_entry_out *outarg, struct inode **inode);
+
/**
* Send FORGET command
*/
@@ -604,6 +612,8 @@ void fuse_abort_conn(struct fuse_conn *fc);
*/
void fuse_invalidate_attr(struct inode *inode);
+void fuse_invalidate_entry_cache(struct dentry *entry);
+
/**
* Acquire reference to fuse_conn
*/
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 3141690558c..6a84388cacf 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -18,6 +18,7 @@
#include <linux/statfs.h>
#include <linux/random.h>
#include <linux/sched.h>
+#include <linux/exportfs.h>
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -353,7 +354,7 @@ enum {
OPT_ERR
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{OPT_FD, "fd=%u"},
{OPT_ROOTMODE, "rootmode=%o"},
{OPT_USER_ID, "user_id=%u"},
@@ -552,6 +553,174 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
return fuse_iget(sb, 1, 0, &attr, 0, 0);
}
+struct fuse_inode_handle
+{
+ u64 nodeid;
+ u32 generation;
+};
+
+static struct dentry *fuse_get_dentry(struct super_block *sb,
+ struct fuse_inode_handle *handle)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct inode *inode;
+ struct dentry *entry;
+ int err = -ESTALE;
+
+ if (handle->nodeid == 0)
+ goto out_err;
+
+ inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid);
+ if (!inode) {
+ struct fuse_entry_out outarg;
+ struct qstr name;
+
+ if (!fc->export_support)
+ goto out_err;
+
+ name.len = 1;
+ name.name = ".";
+ err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg,
+ &inode);
+ if (err && err != -ENOENT)
+ goto out_err;
+ if (err || !inode) {
+ err = -ESTALE;
+ goto out_err;
+ }
+ err = -EIO;
+ if (get_node_id(inode) != handle->nodeid)
+ goto out_iput;
+ }
+ err = -ESTALE;
+ if (inode->i_generation != handle->generation)
+ goto out_iput;
+
+ entry = d_alloc_anon(inode);
+ err = -ENOMEM;
+ if (!entry)
+ goto out_iput;
+
+ if (get_node_id(inode) != FUSE_ROOT_ID) {
+ entry->d_op = &fuse_dentry_operations;
+ fuse_invalidate_entry_cache(entry);
+ }
+
+ return entry;
+
+ out_iput:
+ iput(inode);
+ out_err:
+ return ERR_PTR(err);
+}
+
+static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
+ int connectable)
+{
+ struct inode *inode = dentry->d_inode;
+ bool encode_parent = connectable && !S_ISDIR(inode->i_mode);
+ int len = encode_parent ? 6 : 3;
+ u64 nodeid;
+ u32 generation;
+
+ if (*max_len < len)
+ return 255;
+
+ nodeid = get_fuse_inode(inode)->nodeid;
+ generation = inode->i_generation;
+
+ fh[0] = (u32)(nodeid >> 32);
+ fh[1] = (u32)(nodeid & 0xffffffff);
+ fh[2] = generation;
+
+ if (encode_parent) {
+ struct inode *parent;
+
+ spin_lock(&dentry->d_lock);
+ parent = dentry->d_parent->d_inode;
+ nodeid = get_fuse_inode(parent)->nodeid;
+ generation = parent->i_generation;
+ spin_unlock(&dentry->d_lock);
+
+ fh[3] = (u32)(nodeid >> 32);
+ fh[4] = (u32)(nodeid & 0xffffffff);
+ fh[5] = generation;
+ }
+
+ *max_len = len;
+ return encode_parent ? 0x82 : 0x81;
+}
+
+static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ struct fuse_inode_handle handle;
+
+ if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3)
+ return NULL;
+
+ handle.nodeid = (u64) fid->raw[0] << 32;
+ handle.nodeid |= (u64) fid->raw[1];
+ handle.generation = fid->raw[2];
+ return fuse_get_dentry(sb, &handle);
+}
+
+static struct dentry *fuse_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ struct fuse_inode_handle parent;
+
+ if (fh_type != 0x82 || fh_len < 6)
+ return NULL;
+
+ parent.nodeid = (u64) fid->raw[3] << 32;
+ parent.nodeid |= (u64) fid->raw[4];
+ parent.generation = fid->raw[5];
+ return fuse_get_dentry(sb, &parent);
+}
+
+static struct dentry *fuse_get_parent(struct dentry *child)
+{
+ struct inode *child_inode = child->d_inode;
+ struct fuse_conn *fc = get_fuse_conn(child_inode);
+ struct inode *inode;
+ struct dentry *parent;
+ struct fuse_entry_out outarg;
+ struct qstr name;
+ int err;
+
+ if (!fc->export_support)
+ return ERR_PTR(-ESTALE);
+
+ name.len = 2;
+ name.name = "..";
+ err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode),
+ &name, &outarg, &inode);
+ if (err && err != -ENOENT)
+ return ERR_PTR(err);
+ if (err || !inode)
+ return ERR_PTR(-ESTALE);
+
+ parent = d_alloc_anon(inode);
+ if (!parent) {
+ iput(inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ if (get_node_id(inode) != FUSE_ROOT_ID) {
+ parent->d_op = &fuse_dentry_operations;
+ fuse_invalidate_entry_cache(parent);
+ }
+
+ return parent;
+}
+
+static const struct export_operations fuse_export_operations = {
+ .fh_to_dentry = fuse_fh_to_dentry,
+ .fh_to_parent = fuse_fh_to_parent,
+ .encode_fh = fuse_encode_fh,
+ .get_parent = fuse_get_parent,
+};
+
static const struct super_operations fuse_super_operations = {
.alloc_inode = fuse_alloc_inode,
.destroy_inode = fuse_destroy_inode,
@@ -581,6 +750,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->no_lock = 1;
if (arg->flags & FUSE_ATOMIC_O_TRUNC)
fc->atomic_o_trunc = 1;
+ if (arg->minor >= 9) {
+ /* LOOKUP has dependency on proto version */
+ if (arg->flags & FUSE_EXPORT_SUPPORT)
+ fc->export_support = 1;
+ }
if (arg->flags & FUSE_BIG_WRITES)
fc->big_writes = 1;
} else {
@@ -607,7 +781,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
arg->minor = FUSE_KERNEL_MINOR_VERSION;
arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
- FUSE_BIG_WRITES;
+ FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);
@@ -652,6 +826,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = FUSE_SUPER_MAGIC;
sb->s_op = &fuse_super_operations;
sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_export_op = &fuse_export_operations;
file = fget(d.fd);
if (!file)
@@ -781,7 +956,7 @@ static inline void unregister_fuseblk(void)
}
#endif
-static void fuse_inode_init_once(struct kmem_cache *cachep, void *foo)
+static void fuse_inode_init_once(void *foo)
{
struct inode * inode = foo;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 7f7947e3dfb..ab2f57e3fb8 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -14,23 +14,11 @@ config GFS2_FS
GFS is perfect consistency -- changes made to the filesystem on one
machine show up immediately on all other machines in the cluster.
- To use the GFS2 filesystem, you will need to enable one or more of
- the below locking modules. Documentation and utilities for GFS2 can
+ To use the GFS2 filesystem in a cluster, you will need to enable
+ the locking module below. Documentation and utilities for GFS2 can
be found here: http://sources.redhat.com/cluster
-config GFS2_FS_LOCKING_NOLOCK
- tristate "GFS2 \"nolock\" locking module"
- depends on GFS2_FS
- help
- Single node locking module for GFS2.
-
- Use this module if you want to use GFS2 on a single node without
- its clustering features. You can still take advantage of the
- large file support, and upgrade to running a full cluster later on
- if required.
-
- If you will only be using GFS2 in cluster mode, you do not need this
- module.
+ The "nolock" lock module is now built in to GFS2 by default.
config GFS2_FS_LOCKING_DLM
tristate "GFS2 DLM locking module"
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index e2350df02a0..ec65851ec80 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -5,6 +5,5 @@ gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
ops_fstype.o ops_inode.o ops_super.o quota.o \
recovery.o rgrp.o super.o sys.o trans.o util.o
-obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
index 3bb11c0f8b5..ef606e3a5cf 100644
--- a/fs/gfs2/gfs2.h
+++ b/fs/gfs2/gfs2.h
@@ -16,11 +16,6 @@ enum {
};
enum {
- NO_WAIT = 0,
- WAIT = 1,
-};
-
-enum {
NO_FORCE = 0,
FORCE = 1,
};
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d636b3e80f5..c962283d4e7 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -45,21 +45,19 @@ struct gfs2_gl_hash_bucket {
struct hlist_head hb_list;
};
-struct glock_iter {
- int hash; /* hash bucket index */
- struct gfs2_sbd *sdp; /* incore superblock */
- struct gfs2_glock *gl; /* current glock struct */
- struct seq_file *seq; /* sequence file for debugfs */
- char string[512]; /* scratch space */
+struct gfs2_glock_iter {
+ int hash; /* hash bucket index */
+ struct gfs2_sbd *sdp; /* incore superblock */
+ struct gfs2_glock *gl; /* current glock struct */
+ char string[512]; /* scratch space */
};
typedef void (*glock_examiner) (struct gfs2_glock * gl);
static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
-static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
-static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
-static void gfs2_glock_drop_th(struct gfs2_glock *gl);
-static void run_queue(struct gfs2_glock *gl);
+static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
+#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
static DECLARE_RWSEM(gfs2_umount_flush_sem);
static struct dentry *gfs2_root;
@@ -123,33 +121,6 @@ static inline rwlock_t *gl_lock_addr(unsigned int x)
#endif
/**
- * relaxed_state_ok - is a requested lock compatible with the current lock mode?
- * @actual: the current state of the lock
- * @requested: the lock state that was requested by the caller
- * @flags: the modifier flags passed in by the caller
- *
- * Returns: 1 if the locks are compatible, 0 otherwise
- */
-
-static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
- int flags)
-{
- if (actual == requested)
- return 1;
-
- if (flags & GL_EXACT)
- return 0;
-
- if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
- return 1;
-
- if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
- return 1;
-
- return 0;
-}
-
-/**
* gl_hash() - Turn glock number into hash bucket number
* @lock: The glock number
*
@@ -182,7 +153,7 @@ static void glock_free(struct gfs2_glock *gl)
struct gfs2_sbd *sdp = gl->gl_sbd;
struct inode *aspace = gl->gl_aspace;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ if (sdp->sd_lockstruct.ls_ops->lm_put_lock)
sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
if (aspace)
@@ -211,17 +182,14 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
int gfs2_glock_put(struct gfs2_glock *gl)
{
int rv = 0;
- struct gfs2_sbd *sdp = gl->gl_sbd;
write_lock(gl_lock_addr(gl->gl_hash));
if (atomic_dec_and_test(&gl->gl_ref)) {
hlist_del(&gl->gl_list);
write_unlock(gl_lock_addr(gl->gl_hash));
- gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
- gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
- gfs2_assert(sdp, list_empty(&gl->gl_holders));
- gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
- gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
+ GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
+ GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim));
+ GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
glock_free(gl);
rv = 1;
goto out;
@@ -281,22 +249,401 @@ static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
return gl;
}
+/**
+ * may_grant - check if its ok to grant a new lock
+ * @gl: The glock
+ * @gh: The lock request which we wish to grant
+ *
+ * Returns: true if its ok to grant the lock
+ */
+
+static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
+{
+ const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list);
+ if ((gh->gh_state == LM_ST_EXCLUSIVE ||
+ gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head)
+ return 0;
+ if (gl->gl_state == gh->gh_state)
+ return 1;
+ if (gh->gh_flags & GL_EXACT)
+ return 0;
+ if (gl->gl_state == LM_ST_EXCLUSIVE) {
+ if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
+ return 1;
+ if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
+ return 1;
+ }
+ if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
+ return 1;
+ return 0;
+}
+
+static void gfs2_holder_wake(struct gfs2_holder *gh)
+{
+ clear_bit(HIF_WAIT, &gh->gh_iflags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&gh->gh_iflags, HIF_WAIT);
+}
+
+/**
+ * do_promote - promote as many requests as possible on the current queue
+ * @gl: The glock
+ *
+ * Returns: true if there is a blocked holder at the head of the list
+ */
+
+static int do_promote(struct gfs2_glock *gl)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_holder *gh, *tmp;
+ int ret;
+
+restart:
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ continue;
+ if (may_grant(gl, gh)) {
+ if (gh->gh_list.prev == &gl->gl_holders &&
+ glops->go_lock) {
+ spin_unlock(&gl->gl_spin);
+ /* FIXME: eliminate this eventually */
+ ret = glops->go_lock(gh);
+ spin_lock(&gl->gl_spin);
+ if (ret) {
+ gh->gh_error = ret;
+ list_del_init(&gh->gh_list);
+ gfs2_holder_wake(gh);
+ goto restart;
+ }
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ gfs2_holder_wake(gh);
+ goto restart;
+ }
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ gfs2_holder_wake(gh);
+ continue;
+ }
+ if (gh->gh_list.prev == &gl->gl_holders)
+ return 1;
+ break;
+ }
+ return 0;
+}
+
+/**
+ * do_error - Something unexpected has happened during a lock request
+ *
+ */
+
+static inline void do_error(struct gfs2_glock *gl, const int ret)
+{
+ struct gfs2_holder *gh, *tmp;
+
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ continue;
+ if (ret & LM_OUT_ERROR)
+ gh->gh_error = -EIO;
+ else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
+ gh->gh_error = GLR_TRYFAILED;
+ else
+ continue;
+ list_del_init(&gh->gh_list);
+ gfs2_holder_wake(gh);
+ }
+}
+
+/**
+ * find_first_waiter - find the first gh that's waiting for the glock
+ * @gl: the glock
+ */
+
+static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return gh;
+ }
+ return NULL;
+}
+
+/**
+ * state_change - record that the glock is now in a different state
+ * @gl: the glock
+ * @new_state the new state
+ *
+ */
+
+static void state_change(struct gfs2_glock *gl, unsigned int new_state)
+{
+ int held1, held2;
+
+ held1 = (gl->gl_state != LM_ST_UNLOCKED);
+ held2 = (new_state != LM_ST_UNLOCKED);
+
+ if (held1 != held2) {
+ if (held2)
+ gfs2_glock_hold(gl);
+ else
+ gfs2_glock_put(gl);
+ }
+
+ gl->gl_state = new_state;
+ gl->gl_tchange = jiffies;
+}
+
+static void gfs2_demote_wake(struct gfs2_glock *gl)
+{
+ gl->gl_demote_state = LM_ST_EXCLUSIVE;
+ clear_bit(GLF_DEMOTE, &gl->gl_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
+}
+
+/**
+ * finish_xmote - The DLM has replied to one of our lock requests
+ * @gl: The glock
+ * @ret: The status from the DLM
+ *
+ */
+
+static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_holder *gh;
+ unsigned state = ret & LM_OUT_ST_MASK;
+
+ spin_lock(&gl->gl_spin);
+ state_change(gl, state);
+ gh = find_first_waiter(gl);
+
+ /* Demote to UN request arrived during demote to SH or DF */
+ if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
+ state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED)
+ gl->gl_target = LM_ST_UNLOCKED;
+
+ /* Check for state != intended state */
+ if (unlikely(state != gl->gl_target)) {
+ if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
+ /* move to back of queue and try next entry */
+ if (ret & LM_OUT_CANCELED) {
+ if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0)
+ list_move_tail(&gh->gh_list, &gl->gl_holders);
+ gh = find_first_waiter(gl);
+ gl->gl_target = gh->gh_state;
+ goto retry;
+ }
+ /* Some error or failed "try lock" - report it */
+ if ((ret & LM_OUT_ERROR) ||
+ (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
+ gl->gl_target = gl->gl_state;
+ do_error(gl, ret);
+ goto out;
+ }
+ }
+ switch(state) {
+ /* Unlocked due to conversion deadlock, try again */
+ case LM_ST_UNLOCKED:
+retry:
+ do_xmote(gl, gh, gl->gl_target);
+ break;
+ /* Conversion fails, unlock and try again */
+ case LM_ST_SHARED:
+ case LM_ST_DEFERRED:
+ do_xmote(gl, gh, LM_ST_UNLOCKED);
+ break;
+ default: /* Everything else */
+ printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state);
+ GLOCK_BUG_ON(gl, 1);
+ }
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_put(gl);
+ return;
+ }
+
+ /* Fast path - we got what we asked for */
+ if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
+ gfs2_demote_wake(gl);
+ if (state != LM_ST_UNLOCKED) {
+ if (glops->go_xmote_bh) {
+ int rv;
+ spin_unlock(&gl->gl_spin);
+ rv = glops->go_xmote_bh(gl, gh);
+ if (rv == -EAGAIN)
+ return;
+ spin_lock(&gl->gl_spin);
+ if (rv) {
+ do_error(gl, rv);
+ goto out;
+ }
+ }
+ do_promote(gl);
+ }
+out:
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_put(gl);
+}
+
+static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
+ unsigned int cur_state, unsigned int req_state,
+ unsigned int flags)
+{
+ int ret = LM_OUT_ERROR;
+
+ if (!sdp->sd_lockstruct.ls_ops->lm_lock)
+ return req_state == LM_ST_UNLOCKED ? 0 : req_state;
+
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+ req_state, flags);
+ return ret;
+}
+
+/**
+ * do_xmote - Calls the DLM to change the state of a lock
+ * @gl: The lock state
+ * @gh: The holder (only for promotes)
+ * @target: The target lock state
+ *
+ */
+
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ unsigned int lck_flags = gh ? gh->gh_flags : 0;
+ int ret;
+
+ lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
+ LM_FLAG_PRIORITY);
+ BUG_ON(gl->gl_state == target);
+ BUG_ON(gl->gl_state == gl->gl_target);
+ if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
+ glops->go_inval) {
+ set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+ do_error(gl, 0); /* Fail queued try locks */
+ }
+ spin_unlock(&gl->gl_spin);
+ if (glops->go_xmote_th)
+ glops->go_xmote_th(gl);
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+ glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
+ clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+
+ gfs2_glock_hold(gl);
+ if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED ||
+ gl->gl_state == LM_ST_DEFERRED) &&
+ !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+ lck_flags |= LM_FLAG_TRY_1CB;
+ ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags);
+
+ if (!(ret & LM_OUT_ASYNC)) {
+ finish_xmote(gl, ret);
+ gfs2_glock_hold(gl);
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
+ } else {
+ GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
+ }
+ spin_lock(&gl->gl_spin);
+}
+
+/**
+ * find_first_holder - find the first "holder" gh
+ * @gl: the glock
+ */
+
+static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ if (!list_empty(&gl->gl_holders)) {
+ gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return gh;
+ }
+ return NULL;
+}
+
+/**
+ * run_queue - do all outstanding tasks related to a glock
+ * @gl: The glock in question
+ * @nonblock: True if we must not block in run_queue
+ *
+ */
+
+static void run_queue(struct gfs2_glock *gl, const int nonblock)
+{
+ struct gfs2_holder *gh = NULL;
+
+ if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
+ return;
+
+ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
+
+ if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
+ gl->gl_demote_state != gl->gl_state) {
+ if (find_first_holder(gl))
+ goto out;
+ if (nonblock)
+ goto out_sched;
+ set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
+ GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
+ gl->gl_target = gl->gl_demote_state;
+ } else {
+ if (test_bit(GLF_DEMOTE, &gl->gl_flags))
+ gfs2_demote_wake(gl);
+ if (do_promote(gl) == 0)
+ goto out;
+ gh = find_first_waiter(gl);
+ gl->gl_target = gh->gh_state;
+ if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+ do_error(gl, 0); /* Fail queued try locks */
+ }
+ do_xmote(gl, gh, gl->gl_target);
+ return;
+
+out_sched:
+ gfs2_glock_hold(gl);
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
+out:
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+}
+
static void glock_work_func(struct work_struct *work)
{
+ unsigned long delay = 0;
struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+ if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+ finish_xmote(gl, gl->gl_reply);
spin_lock(&gl->gl_spin);
- if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags))
- set_bit(GLF_DEMOTE, &gl->gl_flags);
- run_queue(gl);
+ if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+ gl->gl_state != LM_ST_UNLOCKED &&
+ gl->gl_demote_state != LM_ST_EXCLUSIVE) {
+ unsigned long holdtime, now = jiffies;
+ holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
+ if (time_before(now, holdtime))
+ delay = holdtime - now;
+ set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags);
+ }
+ run_queue(gl, 0);
spin_unlock(&gl->gl_spin);
- gfs2_glock_put(gl);
+ if (!delay ||
+ queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
+ gfs2_glock_put(gl);
}
static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
void **lockp)
{
int error = -EIO;
+ if (!sdp->sd_lockstruct.ls_ops->lm_get_lock)
+ return 0;
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
sdp->sd_lockstruct.ls_lockspace, name, lockp);
@@ -342,12 +689,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
gl->gl_name = name;
atomic_set(&gl->gl_ref, 1);
gl->gl_state = LM_ST_UNLOCKED;
+ gl->gl_target = LM_ST_UNLOCKED;
gl->gl_demote_state = LM_ST_EXCLUSIVE;
gl->gl_hash = hash;
- gl->gl_owner_pid = NULL;
- gl->gl_ip = 0;
gl->gl_ops = glops;
- gl->gl_req_gh = NULL;
gl->gl_stamp = jiffies;
gl->gl_tchange = jiffies;
gl->gl_object = NULL;
@@ -447,13 +792,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
gh->gh_ip = 0;
}
-static void gfs2_holder_wake(struct gfs2_holder *gh)
-{
- clear_bit(HIF_WAIT, &gh->gh_iflags);
- smp_mb__after_clear_bit();
- wake_up_bit(&gh->gh_iflags, HIF_WAIT);
-}
-
static int just_schedule(void *word)
{
schedule();
@@ -466,14 +804,6 @@ static void wait_on_holder(struct gfs2_holder *gh)
wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE);
}
-static void gfs2_demote_wake(struct gfs2_glock *gl)
-{
- gl->gl_demote_state = LM_ST_EXCLUSIVE;
- clear_bit(GLF_DEMOTE, &gl->gl_flags);
- smp_mb__after_clear_bit();
- wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
-}
-
static void wait_on_demote(struct gfs2_glock *gl)
{
might_sleep();
@@ -481,217 +811,6 @@ static void wait_on_demote(struct gfs2_glock *gl)
}
/**
- * rq_mutex - process a mutex request in the queue
- * @gh: the glock holder
- *
- * Returns: 1 if the queue is blocked
- */
-
-static int rq_mutex(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
-
- list_del_init(&gh->gh_list);
- /* gh->gh_error never examined. */
- set_bit(GLF_LOCK, &gl->gl_flags);
- clear_bit(HIF_WAIT, &gh->gh_iflags);
- smp_mb();
- wake_up_bit(&gh->gh_iflags, HIF_WAIT);
-
- return 1;
-}
-
-/**
- * rq_promote - process a promote request in the queue
- * @gh: the glock holder
- *
- * Acquire a new inter-node lock, or change a lock state to more restrictive.
- *
- * Returns: 1 if the queue is blocked
- */
-
-static int rq_promote(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
-
- if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
- if (list_empty(&gl->gl_holders)) {
- gl->gl_req_gh = gh;
- set_bit(GLF_LOCK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- gfs2_glock_xmote_th(gh->gh_gl, gh);
- spin_lock(&gl->gl_spin);
- }
- return 1;
- }
-
- if (list_empty(&gl->gl_holders)) {
- set_bit(HIF_FIRST, &gh->gh_iflags);
- set_bit(GLF_LOCK, &gl->gl_flags);
- } else {
- struct gfs2_holder *next_gh;
- if (gh->gh_state == LM_ST_EXCLUSIVE)
- return 1;
- next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
- gh_list);
- if (next_gh->gh_state == LM_ST_EXCLUSIVE)
- return 1;
- }
-
- list_move_tail(&gh->gh_list, &gl->gl_holders);
- gh->gh_error = 0;
- set_bit(HIF_HOLDER, &gh->gh_iflags);
-
- gfs2_holder_wake(gh);
-
- return 0;
-}
-
-/**
- * rq_demote - process a demote request in the queue
- * @gh: the glock holder
- *
- * Returns: 1 if the queue is blocked
- */
-
-static int rq_demote(struct gfs2_glock *gl)
-{
- if (!list_empty(&gl->gl_holders))
- return 1;
-
- if (gl->gl_state == gl->gl_demote_state ||
- gl->gl_state == LM_ST_UNLOCKED) {
- gfs2_demote_wake(gl);
- return 0;
- }
-
- set_bit(GLF_LOCK, &gl->gl_flags);
- set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
-
- if (gl->gl_demote_state == LM_ST_UNLOCKED ||
- gl->gl_state != LM_ST_EXCLUSIVE) {
- spin_unlock(&gl->gl_spin);
- gfs2_glock_drop_th(gl);
- } else {
- spin_unlock(&gl->gl_spin);
- gfs2_glock_xmote_th(gl, NULL);
- }
-
- spin_lock(&gl->gl_spin);
- clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
-
- return 0;
-}
-
-/**
- * run_queue - process holder structures on a glock
- * @gl: the glock
- *
- */
-static void run_queue(struct gfs2_glock *gl)
-{
- struct gfs2_holder *gh;
- int blocked = 1;
-
- for (;;) {
- if (test_bit(GLF_LOCK, &gl->gl_flags))
- break;
-
- if (!list_empty(&gl->gl_waiters1)) {
- gh = list_entry(gl->gl_waiters1.next,
- struct gfs2_holder, gh_list);
- blocked = rq_mutex(gh);
- } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
- blocked = rq_demote(gl);
- if (test_bit(GLF_WAITERS2, &gl->gl_flags) &&
- !blocked) {
- set_bit(GLF_DEMOTE, &gl->gl_flags);
- gl->gl_demote_state = LM_ST_UNLOCKED;
- }
- clear_bit(GLF_WAITERS2, &gl->gl_flags);
- } else if (!list_empty(&gl->gl_waiters3)) {
- gh = list_entry(gl->gl_waiters3.next,
- struct gfs2_holder, gh_list);
- blocked = rq_promote(gh);
- } else
- break;
-
- if (blocked)
- break;
- }
-}
-
-/**
- * gfs2_glmutex_lock - acquire a local lock on a glock
- * @gl: the glock
- *
- * Gives caller exclusive access to manipulate a glock structure.
- */
-
-static void gfs2_glmutex_lock(struct gfs2_glock *gl)
-{
- spin_lock(&gl->gl_spin);
- if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
- struct gfs2_holder gh;
-
- gfs2_holder_init(gl, 0, 0, &gh);
- set_bit(HIF_WAIT, &gh.gh_iflags);
- list_add_tail(&gh.gh_list, &gl->gl_waiters1);
- spin_unlock(&gl->gl_spin);
- wait_on_holder(&gh);
- gfs2_holder_uninit(&gh);
- } else {
- gl->gl_owner_pid = get_pid(task_pid(current));
- gl->gl_ip = (unsigned long)__builtin_return_address(0);
- spin_unlock(&gl->gl_spin);
- }
-}
-
-/**
- * gfs2_glmutex_trylock - try to acquire a local lock on a glock
- * @gl: the glock
- *
- * Returns: 1 if the glock is acquired
- */
-
-static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
-{
- int acquired = 1;
-
- spin_lock(&gl->gl_spin);
- if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
- acquired = 0;
- } else {
- gl->gl_owner_pid = get_pid(task_pid(current));
- gl->gl_ip = (unsigned long)__builtin_return_address(0);
- }
- spin_unlock(&gl->gl_spin);
-
- return acquired;
-}
-
-/**
- * gfs2_glmutex_unlock - release a local lock on a glock
- * @gl: the glock
- *
- */
-
-static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
-{
- struct pid *pid;
-
- spin_lock(&gl->gl_spin);
- clear_bit(GLF_LOCK, &gl->gl_flags);
- pid = gl->gl_owner_pid;
- gl->gl_owner_pid = NULL;
- gl->gl_ip = 0;
- run_queue(gl);
- spin_unlock(&gl->gl_spin);
-
- put_pid(pid);
-}
-
-/**
* handle_callback - process a demote request
* @gl: the glock
* @state: the state the caller wants us to change to
@@ -705,398 +824,45 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
{
int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
- spin_lock(&gl->gl_spin);
set_bit(bit, &gl->gl_flags);
if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
gl->gl_demote_state = state;
gl->gl_demote_time = jiffies;
if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
- gl->gl_object) {
+ gl->gl_object)
gfs2_glock_schedule_for_reclaim(gl);
- spin_unlock(&gl->gl_spin);
- return;
- }
} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
gl->gl_demote_state != state) {
- if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
- set_bit(GLF_WAITERS2, &gl->gl_flags);
- else
- gl->gl_demote_state = LM_ST_UNLOCKED;
- }
- spin_unlock(&gl->gl_spin);
-}
-
-/**
- * state_change - record that the glock is now in a different state
- * @gl: the glock
- * @new_state the new state
- *
- */
-
-static void state_change(struct gfs2_glock *gl, unsigned int new_state)
-{
- int held1, held2;
-
- held1 = (gl->gl_state != LM_ST_UNLOCKED);
- held2 = (new_state != LM_ST_UNLOCKED);
-
- if (held1 != held2) {
- if (held2)
- gfs2_glock_hold(gl);
- else
- gfs2_glock_put(gl);
- }
-
- gl->gl_state = new_state;
- gl->gl_tchange = jiffies;
-}
-
-/**
- * drop_bh - Called after a lock module unlock completes
- * @gl: the glock
- * @ret: the return status
- *
- * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
- * Doesn't drop the reference on the glock the top half took out
- *
- */
-
-static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- struct gfs2_holder *gh = gl->gl_req_gh;
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, !ret);
-
- state_change(gl, LM_ST_UNLOCKED);
-
- if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) {
- spin_lock(&gl->gl_spin);
- gh->gh_error = 0;
- spin_unlock(&gl->gl_spin);
- gfs2_glock_xmote_th(gl, gl->gl_req_gh);
- gfs2_glock_put(gl);
- return;
+ gl->gl_demote_state = LM_ST_UNLOCKED;
}
-
- spin_lock(&gl->gl_spin);
- gfs2_demote_wake(gl);
- clear_bit(GLF_LOCK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- gfs2_glock_put(gl);
}
/**
- * xmote_bh - Called after the lock module is done acquiring a lock
- * @gl: The glock in question
- * @ret: the int returned from the lock module
- *
- */
-
-static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- struct gfs2_holder *gh = gl->gl_req_gh;
- int op_done = 1;
-
- if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
- drop_bh(gl, ret);
- return;
- }
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
-
- state_change(gl, ret & LM_OUT_ST_MASK);
-
- /* Deal with each possible exit condition */
-
- if (!gh) {
- gl->gl_stamp = jiffies;
- if (ret & LM_OUT_CANCELED) {
- op_done = 0;
- } else {
- spin_lock(&gl->gl_spin);
- if (gl->gl_state != gl->gl_demote_state) {
- spin_unlock(&gl->gl_spin);
- gfs2_glock_drop_th(gl);
- gfs2_glock_put(gl);
- return;
- }
- gfs2_demote_wake(gl);
- spin_unlock(&gl->gl_spin);
- }
- } else {
- spin_lock(&gl->gl_spin);
- if (ret & LM_OUT_CONV_DEADLK) {
- gh->gh_error = 0;
- set_bit(GLF_CONV_DEADLK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- gfs2_glock_drop_th(gl);
- gfs2_glock_put(gl);
- return;
- }
- list_del_init(&gh->gh_list);
- gh->gh_error = -EIO;
- if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- goto out;
- gh->gh_error = GLR_CANCELED;
- if (ret & LM_OUT_CANCELED)
- goto out;
- if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
- list_add_tail(&gh->gh_list, &gl->gl_holders);
- gh->gh_error = 0;
- set_bit(HIF_HOLDER, &gh->gh_iflags);
- set_bit(HIF_FIRST, &gh->gh_iflags);
- op_done = 0;
- goto out;
- }
- gh->gh_error = GLR_TRYFAILED;
- if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
- goto out;
- gh->gh_error = -EINVAL;
- if (gfs2_assert_withdraw(sdp, 0) == -1)
- fs_err(sdp, "ret = 0x%.8X\n", ret);
-out:
- spin_unlock(&gl->gl_spin);
- }
-
- if (glops->go_xmote_bh)
- glops->go_xmote_bh(gl);
-
- if (op_done) {
- spin_lock(&gl->gl_spin);
- gl->gl_req_gh = NULL;
- clear_bit(GLF_LOCK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- }
-
- gfs2_glock_put(gl);
-
- if (gh)
- gfs2_holder_wake(gh);
-}
-
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
- unsigned int cur_state, unsigned int req_state,
- unsigned int flags)
-{
- int ret = 0;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
- req_state, flags);
- return ret;
-}
-
-/**
- * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
- * @gl: The glock in question
- * @state: the requested state
- * @flags: modifier flags to the lock call
- *
- */
-
-static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- int flags = gh ? gh->gh_flags : 0;
- unsigned state = gh ? gh->gh_state : gl->gl_demote_state;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
- LM_FLAG_NOEXP | LM_FLAG_ANY |
- LM_FLAG_PRIORITY);
- unsigned int lck_ret;
-
- if (glops->go_xmote_th)
- glops->go_xmote_th(gl);
- if (state == LM_ST_DEFERRED && glops->go_inval)
- glops->go_inval(gl, DIO_METADATA);
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
- gfs2_assert_warn(sdp, state != gl->gl_state);
-
- gfs2_glock_hold(gl);
-
- lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
-
- if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
- return;
-
- if (lck_ret & LM_OUT_ASYNC)
- gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
- else
- xmote_bh(gl, lck_ret);
-}
-
-static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
- unsigned int cur_state)
-{
- int ret = 0;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
- return ret;
-}
-
-/**
- * gfs2_glock_drop_th - call into the lock module to unlock a lock
- * @gl: the glock
- *
- */
-
-static void gfs2_glock_drop_th(struct gfs2_glock *gl)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- unsigned int ret;
-
- if (glops->go_xmote_th)
- glops->go_xmote_th(gl);
- if (glops->go_inval)
- glops->go_inval(gl, DIO_METADATA);
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
-
- gfs2_glock_hold(gl);
-
- ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
-
- if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
- return;
-
- if (!ret)
- drop_bh(gl, ret);
- else
- gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
-}
-
-/**
- * do_cancels - cancel requests for locks stuck waiting on an expire flag
- * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
- *
- * Don't cancel GL_NOCANCEL requests.
- */
-
-static void do_cancels(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
-
- spin_lock(&gl->gl_spin);
-
- while (gl->gl_req_gh != gh &&
- !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
- !list_empty(&gh->gh_list)) {
- if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
- spin_unlock(&gl->gl_spin);
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
- msleep(100);
- spin_lock(&gl->gl_spin);
- } else {
- spin_unlock(&gl->gl_spin);
- msleep(100);
- spin_lock(&gl->gl_spin);
- }
- }
-
- spin_unlock(&gl->gl_spin);
-}
-
-/**
- * glock_wait_internal - wait on a glock acquisition
+ * gfs2_glock_wait - wait on a glock acquisition
* @gh: the glock holder
*
* Returns: 0 on success
*/
-static int glock_wait_internal(struct gfs2_holder *gh)
+int gfs2_glock_wait(struct gfs2_holder *gh)
{
- struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
-
- if (test_bit(HIF_ABORTED, &gh->gh_iflags))
- return -EIO;
-
- if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
- spin_lock(&gl->gl_spin);
- if (gl->gl_req_gh != gh &&
- !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
- !list_empty(&gh->gh_list)) {
- list_del_init(&gh->gh_list);
- gh->gh_error = GLR_TRYFAILED;
- run_queue(gl);
- spin_unlock(&gl->gl_spin);
- return gh->gh_error;
- }
- spin_unlock(&gl->gl_spin);
- }
-
- if (gh->gh_flags & LM_FLAG_PRIORITY)
- do_cancels(gh);
-
wait_on_holder(gh);
- if (gh->gh_error)
- return gh->gh_error;
-
- gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
- gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
- gh->gh_flags));
-
- if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-
- if (glops->go_lock) {
- gh->gh_error = glops->go_lock(gh);
- if (gh->gh_error) {
- spin_lock(&gl->gl_spin);
- list_del_init(&gh->gh_list);
- spin_unlock(&gl->gl_spin);
- }
- }
-
- spin_lock(&gl->gl_spin);
- gl->gl_req_gh = NULL;
- clear_bit(GLF_LOCK, &gl->gl_flags);
- run_queue(gl);
- spin_unlock(&gl->gl_spin);
- }
-
return gh->gh_error;
}
-static inline struct gfs2_holder *
-find_holder_by_owner(struct list_head *head, struct pid *pid)
-{
- struct gfs2_holder *gh;
-
- list_for_each_entry(gh, head, gh_list) {
- if (gh->gh_owner_pid == pid)
- return gh;
- }
-
- return NULL;
-}
-
-static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
+void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
- if (gi) {
+ if (seq) {
+ struct gfs2_glock_iter *gi = seq->private;
vsprintf(gi->string, fmt, args);
- seq_printf(gi->seq, gi->string);
- }
- else
+ seq_printf(seq, gi->string);
+ } else {
+ printk(KERN_ERR " ");
vprintk(fmt, args);
+ }
va_end(args);
}
@@ -1104,50 +870,76 @@ static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
* add_to_queue - Add a holder to the wait queue (but look for recursion)
* @gh: the holder structure to add
*
+ * Eventually we should move the recursive locking trap to a
+ * debugging option or something like that. This is the fast
+ * path and needs to have the minimum number of distractions.
+ *
*/
-static void add_to_queue(struct gfs2_holder *gh)
+static inline void add_to_queue(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_holder *existing;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct list_head *insert_pt = NULL;
+ struct gfs2_holder *gh2;
+ int try_lock = 0;
BUG_ON(gh->gh_owner_pid == NULL);
if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
BUG();
- if (!(gh->gh_flags & GL_FLOCK)) {
- existing = find_holder_by_owner(&gl->gl_holders,
- gh->gh_owner_pid);
- if (existing) {
- print_symbol(KERN_WARNING "original: %s\n",
- existing->gh_ip);
- printk(KERN_INFO "pid : %d\n",
- pid_nr(existing->gh_owner_pid));
- printk(KERN_INFO "lock type : %d lock state : %d\n",
- existing->gh_gl->gl_name.ln_type,
- existing->gh_gl->gl_state);
- print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
- printk(KERN_INFO "pid : %d\n",
- pid_nr(gh->gh_owner_pid));
- printk(KERN_INFO "lock type : %d lock state : %d\n",
- gl->gl_name.ln_type, gl->gl_state);
- BUG();
- }
-
- existing = find_holder_by_owner(&gl->gl_waiters3,
- gh->gh_owner_pid);
- if (existing) {
- print_symbol(KERN_WARNING "original: %s\n",
- existing->gh_ip);
- print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
- BUG();
+ if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+ if (test_bit(GLF_LOCK, &gl->gl_flags))
+ try_lock = 1;
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+ goto fail;
+ }
+
+ list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
+ if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
+ (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
+ goto trap_recursive;
+ if (try_lock &&
+ !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) &&
+ !may_grant(gl, gh)) {
+fail:
+ gh->gh_error = GLR_TRYFAILED;
+ gfs2_holder_wake(gh);
+ return;
}
+ if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
+ continue;
+ if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
+ insert_pt = &gh2->gh_list;
+ }
+ if (likely(insert_pt == NULL)) {
+ list_add_tail(&gh->gh_list, &gl->gl_holders);
+ if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
+ goto do_cancel;
+ return;
+ }
+ list_add_tail(&gh->gh_list, insert_pt);
+do_cancel:
+ gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+ if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
+ spin_unlock(&gl->gl_spin);
+ if (sdp->sd_lockstruct.ls_ops->lm_cancel)
+ sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
+ spin_lock(&gl->gl_spin);
}
+ return;
- if (gh->gh_flags & LM_FLAG_PRIORITY)
- list_add(&gh->gh_list, &gl->gl_waiters3);
- else
- list_add_tail(&gh->gh_list, &gl->gl_waiters3);
+trap_recursive:
+ print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip);
+ printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
+ printk(KERN_ERR "lock type: %d req lock state : %d\n",
+ gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
+ print_symbol(KERN_ERR "new: %s\n", gh->gh_ip);
+ printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
+ printk(KERN_ERR "lock type: %d req lock state : %d\n",
+ gh->gh_gl->gl_name.ln_type, gh->gh_state);
+ __dump_glock(NULL, gl);
+ BUG();
}
/**
@@ -1165,24 +957,16 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
struct gfs2_sbd *sdp = gl->gl_sbd;
int error = 0;
-restart:
- if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
- set_bit(HIF_ABORTED, &gh->gh_iflags);
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
return -EIO;
- }
spin_lock(&gl->gl_spin);
add_to_queue(gh);
- run_queue(gl);
+ run_queue(gl, 1);
spin_unlock(&gl->gl_spin);
- if (!(gh->gh_flags & GL_ASYNC)) {
- error = glock_wait_internal(gh);
- if (error == GLR_CANCELED) {
- msleep(100);
- goto restart;
- }
- }
+ if (!(gh->gh_flags & GL_ASYNC))
+ error = gfs2_glock_wait(gh);
return error;
}
@@ -1196,48 +980,7 @@ restart:
int gfs2_glock_poll(struct gfs2_holder *gh)
{
- struct gfs2_glock *gl = gh->gh_gl;
- int ready = 0;
-
- spin_lock(&gl->gl_spin);
-
- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
- ready = 1;
- else if (list_empty(&gh->gh_list)) {
- if (gh->gh_error == GLR_CANCELED) {
- spin_unlock(&gl->gl_spin);
- msleep(100);
- if (gfs2_glock_nq(gh))
- return 1;
- return 0;
- } else
- ready = 1;
- }
-
- spin_unlock(&gl->gl_spin);
-
- return ready;
-}
-
-/**
- * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
- * @gh: the holder structure
- *
- * Returns: 0, GLR_TRYFAILED, or errno on failure
- */
-
-int gfs2_glock_wait(struct gfs2_holder *gh)
-{
- int error;
-
- error = glock_wait_internal(gh);
- if (error == GLR_CANCELED) {
- msleep(100);
- gh->gh_flags &= ~GL_ASYNC;
- error = gfs2_glock_nq(gh);
- }
-
- return error;
+ return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
}
/**
@@ -1251,26 +994,30 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
struct gfs2_glock *gl = gh->gh_gl;
const struct gfs2_glock_operations *glops = gl->gl_ops;
unsigned delay = 0;
+ int fast_path = 0;
+ spin_lock(&gl->gl_spin);
if (gh->gh_flags & GL_NOCACHE)
handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
- gfs2_glmutex_lock(gl);
-
- spin_lock(&gl->gl_spin);
list_del_init(&gh->gh_list);
-
- if (list_empty(&gl->gl_holders)) {
+ if (find_first_holder(gl) == NULL) {
if (glops->go_unlock) {
+ GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
spin_unlock(&gl->gl_spin);
glops->go_unlock(gh);
spin_lock(&gl->gl_spin);
+ clear_bit(GLF_LOCK, &gl->gl_flags);
}
gl->gl_stamp = jiffies;
+ if (list_empty(&gl->gl_holders) &&
+ !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+ !test_bit(GLF_DEMOTE, &gl->gl_flags))
+ fast_path = 1;
}
-
- clear_bit(GLF_LOCK, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
+ if (likely(fast_path))
+ return;
gfs2_glock_hold(gl);
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -1454,6 +1201,8 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
{
int error = -EIO;
+ if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb)
+ return 0;
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
return error;
@@ -1469,20 +1218,14 @@ int gfs2_lvb_hold(struct gfs2_glock *gl)
{
int error;
- gfs2_glmutex_lock(gl);
-
if (!atomic_read(&gl->gl_lvb_count)) {
error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
- if (error) {
- gfs2_glmutex_unlock(gl);
+ if (error)
return error;
- }
gfs2_glock_hold(gl);
}
atomic_inc(&gl->gl_lvb_count);
- gfs2_glmutex_unlock(gl);
-
return 0;
}
@@ -1497,17 +1240,13 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl)
struct gfs2_sbd *sdp = gl->gl_sbd;
gfs2_glock_hold(gl);
- gfs2_glmutex_lock(gl);
-
gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
if (atomic_dec_and_test(&gl->gl_lvb_count)) {
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb)
sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
gl->gl_lvb = NULL;
gfs2_glock_put(gl);
}
-
- gfs2_glmutex_unlock(gl);
gfs2_glock_put(gl);
}
@@ -1526,8 +1265,12 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
if (time_before(now, holdtime))
delay = holdtime - now;
+ if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+ delay = gl->gl_ops->go_min_hold_time;
+ spin_lock(&gl->gl_spin);
handle_callback(gl, state, 1, delay);
+ spin_unlock(&gl->gl_spin);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
gfs2_glock_put(gl);
}
@@ -1568,7 +1311,8 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
gl = gfs2_glock_find(sdp, &async->lc_name);
if (gfs2_assert_warn(sdp, gl))
return;
- xmote_bh(gl, async->lc_ret);
+ gl->gl_reply = async->lc_ret;
+ set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
up_read(&gfs2_umount_flush_sem);
@@ -1581,11 +1325,6 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
wake_up_process(sdp->sd_recoverd_process);
return;
- case LM_CB_DROPLOCKS:
- gfs2_gl_hash_clear(sdp, NO_WAIT);
- gfs2_quota_scan(sdp);
- return;
-
default:
gfs2_assert_warn(sdp, 0);
return;
@@ -1646,6 +1385,7 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
{
struct gfs2_glock *gl;
+ int done_callback = 0;
spin_lock(&sdp->sd_reclaim_lock);
if (list_empty(&sdp->sd_reclaim_list)) {
@@ -1660,14 +1400,16 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
atomic_dec(&sdp->sd_reclaim_count);
atomic_inc(&sdp->sd_reclaimed);
- if (gfs2_glmutex_trylock(gl)) {
- if (list_empty(&gl->gl_holders) &&
- gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
- handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
- gfs2_glmutex_unlock(gl);
+ spin_lock(&gl->gl_spin);
+ if (find_first_holder(gl) == NULL &&
+ gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) {
+ handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+ done_callback = 1;
}
-
- gfs2_glock_put(gl);
+ spin_unlock(&gl->gl_spin);
+ if (!done_callback ||
+ queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
}
/**
@@ -1724,18 +1466,14 @@ static void scan_glock(struct gfs2_glock *gl)
{
if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
return;
+ if (test_bit(GLF_LOCK, &gl->gl_flags))
+ return;
- if (gfs2_glmutex_trylock(gl)) {
- if (list_empty(&gl->gl_holders) &&
- gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
- goto out_schedule;
- gfs2_glmutex_unlock(gl);
- }
- return;
-
-out_schedule:
- gfs2_glmutex_unlock(gl);
- gfs2_glock_schedule_for_reclaim(gl);
+ spin_lock(&gl->gl_spin);
+ if (find_first_holder(gl) == NULL &&
+ gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
+ gfs2_glock_schedule_for_reclaim(gl);
+ spin_unlock(&gl->gl_spin);
}
/**
@@ -1760,12 +1498,13 @@ static void clear_glock(struct gfs2_glock *gl)
spin_unlock(&sdp->sd_reclaim_lock);
}
- if (gfs2_glmutex_trylock(gl)) {
- if (list_empty(&gl->gl_holders) &&
- gl->gl_state != LM_ST_UNLOCKED)
- handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
- gfs2_glmutex_unlock(gl);
- }
+ spin_lock(&gl->gl_spin);
+ if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
+ handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_hold(gl);
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
}
/**
@@ -1773,11 +1512,10 @@ static void clear_glock(struct gfs2_glock *gl)
* @sdp: the filesystem
* @wait: wait until it's all gone
*
- * Called when unmounting the filesystem, or when inter-node lock manager
- * requests DROPLOCKS because it is running out of capacity.
+ * Called when unmounting the filesystem.
*/
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
{
unsigned long t;
unsigned int x;
@@ -1792,7 +1530,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
cont = 1;
}
- if (!wait || !cont)
+ if (!cont)
break;
if (time_after_eq(jiffies,
@@ -1810,180 +1548,162 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
}
}
-/*
- * Diagnostic routines to help debug distributed deadlock
- */
-
-static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt,
- unsigned long address)
+static const char *state2str(unsigned state)
{
- char buffer[KSYM_SYMBOL_LEN];
-
- sprint_symbol(buffer, address);
- print_dbg(gi, fmt, buffer);
+ switch(state) {
+ case LM_ST_UNLOCKED:
+ return "UN";
+ case LM_ST_SHARED:
+ return "SH";
+ case LM_ST_DEFERRED:
+ return "DF";
+ case LM_ST_EXCLUSIVE:
+ return "EX";
+ }
+ return "??";
+}
+
+static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
+{
+ char *p = buf;
+ if (flags & LM_FLAG_TRY)
+ *p++ = 't';
+ if (flags & LM_FLAG_TRY_1CB)
+ *p++ = 'T';
+ if (flags & LM_FLAG_NOEXP)
+ *p++ = 'e';
+ if (flags & LM_FLAG_ANY)
+ *p++ = 'a';
+ if (flags & LM_FLAG_PRIORITY)
+ *p++ = 'p';
+ if (flags & GL_ASYNC)
+ *p++ = 'a';
+ if (flags & GL_EXACT)
+ *p++ = 'E';
+ if (flags & GL_NOCACHE)
+ *p++ = 'c';
+ if (test_bit(HIF_HOLDER, &iflags))
+ *p++ = 'H';
+ if (test_bit(HIF_WAIT, &iflags))
+ *p++ = 'W';
+ if (test_bit(HIF_FIRST, &iflags))
+ *p++ = 'F';
+ *p = 0;
+ return buf;
}
/**
* dump_holder - print information about a glock holder
- * @str: a string naming the type of holder
+ * @seq: the seq_file struct
* @gh: the glock holder
*
* Returns: 0 on success, -ENOBUFS when we run out of space
*/
-static int dump_holder(struct glock_iter *gi, char *str,
- struct gfs2_holder *gh)
+static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
{
- unsigned int x;
- struct task_struct *gh_owner;
+ struct task_struct *gh_owner = NULL;
+ char buffer[KSYM_SYMBOL_LEN];
+ char flags_buf[32];
- print_dbg(gi, " %s\n", str);
- if (gh->gh_owner_pid) {
- print_dbg(gi, " owner = %ld ",
- (long)pid_nr(gh->gh_owner_pid));
+ sprint_symbol(buffer, gh->gh_ip);
+ if (gh->gh_owner_pid)
gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
- if (gh_owner)
- print_dbg(gi, "(%s)\n", gh_owner->comm);
- else
- print_dbg(gi, "(ended)\n");
- } else
- print_dbg(gi, " owner = -1\n");
- print_dbg(gi, " gh_state = %u\n", gh->gh_state);
- print_dbg(gi, " gh_flags =");
- for (x = 0; x < 32; x++)
- if (gh->gh_flags & (1 << x))
- print_dbg(gi, " %u", x);
- print_dbg(gi, " \n");
- print_dbg(gi, " error = %d\n", gh->gh_error);
- print_dbg(gi, " gh_iflags =");
- for (x = 0; x < 32; x++)
- if (test_bit(x, &gh->gh_iflags))
- print_dbg(gi, " %u", x);
- print_dbg(gi, " \n");
- gfs2_print_symbol(gi, " initialized at: %s\n", gh->gh_ip);
-
+ gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
+ state2str(gh->gh_state),
+ hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
+ gh->gh_error,
+ gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
+ gh_owner ? gh_owner->comm : "(ended)", buffer);
return 0;
}
-/**
- * dump_inode - print information about an inode
- * @ip: the inode
- *
- * Returns: 0 on success, -ENOBUFS when we run out of space
- */
-
-static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip)
-{
- unsigned int x;
-
- print_dbg(gi, " Inode:\n");
- print_dbg(gi, " num = %llu/%llu\n",
- (unsigned long long)ip->i_no_formal_ino,
- (unsigned long long)ip->i_no_addr);
- print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode));
- print_dbg(gi, " i_flags =");
- for (x = 0; x < 32; x++)
- if (test_bit(x, &ip->i_flags))
- print_dbg(gi, " %u", x);
- print_dbg(gi, " \n");
- return 0;
+static const char *gflags2str(char *buf, const unsigned long *gflags)
+{
+ char *p = buf;
+ if (test_bit(GLF_LOCK, gflags))
+ *p++ = 'l';
+ if (test_bit(GLF_STICKY, gflags))
+ *p++ = 's';
+ if (test_bit(GLF_DEMOTE, gflags))
+ *p++ = 'D';
+ if (test_bit(GLF_PENDING_DEMOTE, gflags))
+ *p++ = 'd';
+ if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags))
+ *p++ = 'p';
+ if (test_bit(GLF_DIRTY, gflags))
+ *p++ = 'y';
+ if (test_bit(GLF_LFLUSH, gflags))
+ *p++ = 'f';
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
+ *p++ = 'i';
+ if (test_bit(GLF_REPLY_PENDING, gflags))
+ *p++ = 'r';
+ *p = 0;
+ return buf;
}
/**
- * dump_glock - print information about a glock
+ * __dump_glock - print information about a glock
+ * @seq: The seq_file struct
* @gl: the glock
- * @count: where we are in the buffer
+ *
+ * The file format is as follows:
+ * One line per object, capital letters are used to indicate objects
+ * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented,
+ * other objects are indented by a single space and follow the glock to
+ * which they are related. Fields are indicated by lower case letters
+ * followed by a colon and the field value, except for strings which are in
+ * [] so that its possible to see if they are composed of spaces for
+ * example. The field's are n = number (id of the object), f = flags,
+ * t = type, s = state, r = refcount, e = error, p = pid.
*
* Returns: 0 on success, -ENOBUFS when we run out of space
*/
-static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
+static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
{
- struct gfs2_holder *gh;
- unsigned int x;
- int error = -ENOBUFS;
- struct task_struct *gl_owner;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ unsigned long long dtime;
+ const struct gfs2_holder *gh;
+ char gflags_buf[32];
+ int error = 0;
- spin_lock(&gl->gl_spin);
+ dtime = jiffies - gl->gl_demote_time;
+ dtime *= 1000000/HZ; /* demote time in uSec */
+ if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
+ dtime = 0;
+ gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n",
+ state2str(gl->gl_state),
+ gl->gl_name.ln_type,
+ (unsigned long long)gl->gl_name.ln_number,
+ gflags2str(gflags_buf, &gl->gl_flags),
+ state2str(gl->gl_target),
+ state2str(gl->gl_demote_state), dtime,
+ atomic_read(&gl->gl_lvb_count),
+ atomic_read(&gl->gl_ail_count),
+ atomic_read(&gl->gl_ref));
- print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type,
- (unsigned long long)gl->gl_name.ln_number);
- print_dbg(gi, " gl_flags =");
- for (x = 0; x < 32; x++) {
- if (test_bit(x, &gl->gl_flags))
- print_dbg(gi, " %u", x);
- }
- if (!test_bit(GLF_LOCK, &gl->gl_flags))
- print_dbg(gi, " (unlocked)");
- print_dbg(gi, " \n");
- print_dbg(gi, " gl_ref = %d\n", atomic_read(&gl->gl_ref));
- print_dbg(gi, " gl_state = %u\n", gl->gl_state);
- if (gl->gl_owner_pid) {
- gl_owner = pid_task(gl->gl_owner_pid, PIDTYPE_PID);
- if (gl_owner)
- print_dbg(gi, " gl_owner = pid %d (%s)\n",
- pid_nr(gl->gl_owner_pid), gl_owner->comm);
- else
- print_dbg(gi, " gl_owner = %d (ended)\n",
- pid_nr(gl->gl_owner_pid));
- } else
- print_dbg(gi, " gl_owner = -1\n");
- print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip);
- print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
- print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
- print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no");
- print_dbg(gi, " reclaim = %s\n",
- (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
- if (gl->gl_aspace)
- print_dbg(gi, " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
- gl->gl_aspace->i_mapping->nrpages);
- else
- print_dbg(gi, " aspace = no\n");
- print_dbg(gi, " ail = %d\n", atomic_read(&gl->gl_ail_count));
- if (gl->gl_req_gh) {
- error = dump_holder(gi, "Request", gl->gl_req_gh);
- if (error)
- goto out;
- }
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
- error = dump_holder(gi, "Holder", gh);
- if (error)
- goto out;
- }
- list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
- error = dump_holder(gi, "Waiter1", gh);
+ error = dump_holder(seq, gh);
if (error)
goto out;
}
- list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
- error = dump_holder(gi, "Waiter3", gh);
- if (error)
- goto out;
- }
- if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
- print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n",
- gl->gl_demote_state, (unsigned long long)
- (jiffies - gl->gl_demote_time)*(1000000/HZ));
- }
- if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
- if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
- list_empty(&gl->gl_holders)) {
- error = dump_inode(gi, gl->gl_object);
- if (error)
- goto out;
- } else {
- error = -ENOBUFS;
- print_dbg(gi, " Inode: busy\n");
- }
- }
-
- error = 0;
-
+ if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
+ error = glops->go_dump(seq, gl);
out:
- spin_unlock(&gl->gl_spin);
return error;
}
+static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
+{
+ int ret;
+ spin_lock(&gl->gl_spin);
+ ret = __dump_glock(seq, gl);
+ spin_unlock(&gl->gl_spin);
+ return ret;
+}
+
/**
* gfs2_dump_lockstate - print out the current lockstate
* @sdp: the filesystem
@@ -2086,7 +1806,7 @@ void gfs2_glock_exit(void)
module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
-static int gfs2_glock_iter_next(struct glock_iter *gi)
+static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
{
struct gfs2_glock *gl;
@@ -2096,15 +1816,17 @@ restart:
if (gl) {
gi->gl = hlist_entry(gl->gl_list.next,
struct gfs2_glock, gl_list);
- if (gi->gl)
- gfs2_glock_hold(gi->gl);
+ } else {
+ gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
+ struct gfs2_glock, gl_list);
}
+ if (gi->gl)
+ gfs2_glock_hold(gi->gl);
read_unlock(gl_lock_addr(gi->hash));
if (gl)
gfs2_glock_put(gl);
- if (gl && gi->gl == NULL)
+ while (gi->gl == NULL) {
gi->hash++;
- while(gi->gl == NULL) {
if (gi->hash >= GFS2_GL_HASH_SIZE)
return 1;
read_lock(gl_lock_addr(gi->hash));
@@ -2113,7 +1835,6 @@ restart:
if (gi->gl)
gfs2_glock_hold(gi->gl);
read_unlock(gl_lock_addr(gi->hash));
- gi->hash++;
}
if (gi->sdp != gi->gl->gl_sbd)
@@ -2122,58 +1843,34 @@ restart:
return 0;
}
-static void gfs2_glock_iter_free(struct glock_iter *gi)
+static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
{
if (gi->gl)
gfs2_glock_put(gi->gl);
- kfree(gi);
-}
-
-static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
-{
- struct glock_iter *gi;
-
- gi = kmalloc(sizeof (*gi), GFP_KERNEL);
- if (!gi)
- return NULL;
-
- gi->sdp = sdp;
- gi->hash = 0;
- gi->seq = NULL;
gi->gl = NULL;
- memset(gi->string, 0, sizeof(gi->string));
-
- if (gfs2_glock_iter_next(gi)) {
- gfs2_glock_iter_free(gi);
- return NULL;
- }
-
- return gi;
}
-static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos)
+static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct glock_iter *gi;
+ struct gfs2_glock_iter *gi = seq->private;
loff_t n = *pos;
- gi = gfs2_glock_iter_init(file->private);
- if (!gi)
- return NULL;
+ gi->hash = 0;
- while(n--) {
+ do {
if (gfs2_glock_iter_next(gi)) {
gfs2_glock_iter_free(gi);
return NULL;
}
- }
+ } while (n--);
- return gi;
+ return gi->gl;
}
-static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
+static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
loff_t *pos)
{
- struct glock_iter *gi = iter_ptr;
+ struct gfs2_glock_iter *gi = seq->private;
(*pos)++;
@@ -2182,24 +1879,18 @@ static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
return NULL;
}
- return gi;
+ return gi->gl;
}
-static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr)
+static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
{
- struct glock_iter *gi = iter_ptr;
- if (gi)
- gfs2_glock_iter_free(gi);
+ struct gfs2_glock_iter *gi = seq->private;
+ gfs2_glock_iter_free(gi);
}
-static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
+static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
{
- struct glock_iter *gi = iter_ptr;
-
- gi->seq = file;
- dump_glock(gi, gi->gl);
-
- return 0;
+ return dump_glock(seq, iter_ptr);
}
static const struct seq_operations gfs2_glock_seq_ops = {
@@ -2211,17 +1902,14 @@ static const struct seq_operations gfs2_glock_seq_ops = {
static int gfs2_debugfs_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int ret;
-
- ret = seq_open(file, &gfs2_glock_seq_ops);
- if (ret)
- return ret;
-
- seq = file->private_data;
- seq->private = inode->i_private;
-
- return 0;
+ int ret = seq_open_private(file, &gfs2_glock_seq_ops,
+ sizeof(struct gfs2_glock_iter));
+ if (ret == 0) {
+ struct seq_file *seq = file->private_data;
+ struct gfs2_glock_iter *gi = seq->private;
+ gi->sdp = inode->i_private;
+ }
+ return ret;
}
static const struct file_operations gfs2_debug_fops = {
@@ -2229,7 +1917,7 @@ static const struct file_operations gfs2_debug_fops = {
.open = gfs2_debugfs_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release
+ .release = seq_release_private,
};
int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index cdad3e6f815..695c6b19361 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -24,13 +24,9 @@
#define GL_ASYNC 0x00000040
#define GL_EXACT 0x00000080
#define GL_SKIP 0x00000100
-#define GL_ATIME 0x00000200
#define GL_NOCACHE 0x00000400
-#define GL_FLOCK 0x00000800
-#define GL_NOCANCEL 0x00001000
#define GLR_TRYFAILED 13
-#define GLR_CANCELED 14
static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
{
@@ -41,6 +37,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
spin_lock(&gl->gl_spin);
pid = task_pid(current);
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ break;
if (gh->gh_owner_pid == pid)
goto out;
}
@@ -70,7 +68,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
{
int ret;
spin_lock(&gl->gl_spin);
- ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3);
+ ret = test_bit(GLF_DEMOTE, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
return ret;
}
@@ -98,6 +96,7 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
/**
* gfs2_glock_nq_init - intialize a holder and enqueue it on a glock
@@ -130,10 +129,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
void gfs2_lvb_unhold(struct gfs2_glock *gl);
void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
-
void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
int __init gfs2_glock_init(void);
void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 07d84d16cda..c6c318c2a0f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
#include <linux/buffer_head.h>
#include <linux/gfs2_ondisk.h>
#include <linux/lm_interface.h>
+#include <linux/bio.h>
#include "gfs2.h"
#include "incore.h"
@@ -172,26 +173,6 @@ static void inode_go_sync(struct gfs2_glock *gl)
}
/**
- * inode_go_xmote_bh - After promoting/demoting a glock
- * @gl: the glock
- *
- */
-
-static void inode_go_xmote_bh(struct gfs2_glock *gl)
-{
- struct gfs2_holder *gh = gl->gl_req_gh;
- struct buffer_head *bh;
- int error;
-
- if (gl->gl_state != LM_ST_UNLOCKED &&
- (!gh || !(gh->gh_flags & GL_SKIP))) {
- error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh);
- if (!error)
- brelse(bh);
- }
-}
-
-/**
* inode_go_inval - prepare a inode glock to be released
* @gl: the glock
* @flags:
@@ -267,6 +248,26 @@ static int inode_go_lock(struct gfs2_holder *gh)
}
/**
+ * inode_go_dump - print information about an inode
+ * @seq: The iterator
+ * @ip: the inode
+ *
+ * Returns: 0 on success, -ENOBUFS when we run out of space
+ */
+
+static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+ const struct gfs2_inode *ip = gl->gl_object;
+ if (ip == NULL)
+ return 0;
+ gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n",
+ (unsigned long long)ip->i_no_formal_ino,
+ (unsigned long long)ip->i_no_addr,
+ IF2DT(ip->i_inode.i_mode), ip->i_flags);
+ return 0;
+}
+
+/**
* rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
* @gl: the glock
*
@@ -306,6 +307,22 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
}
/**
+ * rgrp_go_dump - print out an rgrp
+ * @seq: The iterator
+ * @gl: The glock in question
+ *
+ */
+
+static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+ const struct gfs2_rgrpd *rgd = gl->gl_object;
+ if (rgd == NULL)
+ return 0;
+ gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr);
+ return 0;
+}
+
+/**
* trans_go_sync - promote/demote the transaction glock
* @gl: the glock
* @state: the requested state
@@ -330,7 +347,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
*
*/
-static void trans_go_xmote_bh(struct gfs2_glock *gl)
+static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
@@ -338,8 +355,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
struct gfs2_log_header_host head;
int error;
- if (gl->gl_state != LM_ST_UNLOCKED &&
- test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+ if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -354,6 +370,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
gfs2_log_pointers_init(sdp, head.lh_blkno);
}
}
+ return 0;
}
/**
@@ -375,12 +392,12 @@ const struct gfs2_glock_operations gfs2_meta_glops = {
const struct gfs2_glock_operations gfs2_inode_glops = {
.go_xmote_th = inode_go_sync,
- .go_xmote_bh = inode_go_xmote_bh,
.go_inval = inode_go_inval,
.go_demote_ok = inode_go_demote_ok,
.go_lock = inode_go_lock,
+ .go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
- .go_min_hold_time = HZ / 10,
+ .go_min_hold_time = HZ / 5,
};
const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -389,8 +406,9 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
.go_demote_ok = rgrp_go_demote_ok,
.go_lock = rgrp_go_lock,
.go_unlock = rgrp_go_unlock,
+ .go_dump = rgrp_go_dump,
.go_type = LM_TYPE_RGRP,
- .go_min_hold_time = HZ / 10,
+ .go_min_hold_time = HZ / 5,
};
const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index eabe5eac41d..f566ec1b4e8 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -77,7 +77,6 @@ struct gfs2_rgrp_host {
struct gfs2_rgrpd {
struct list_head rd_list; /* Link with superblock */
struct list_head rd_list_mru;
- struct list_head rd_recent; /* Recently used rgrps */
struct gfs2_glock *rd_gl; /* Glock for this rgrp */
u64 rd_addr; /* grp block disk address */
u64 rd_data0; /* first data location */
@@ -128,20 +127,20 @@ struct gfs2_bufdata {
struct gfs2_glock_operations {
void (*go_xmote_th) (struct gfs2_glock *gl);
- void (*go_xmote_bh) (struct gfs2_glock *gl);
+ int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
void (*go_inval) (struct gfs2_glock *gl, int flags);
int (*go_demote_ok) (struct gfs2_glock *gl);
int (*go_lock) (struct gfs2_holder *gh);
void (*go_unlock) (struct gfs2_holder *gh);
+ int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
const int go_type;
const unsigned long go_min_hold_time;
};
enum {
/* States */
- HIF_HOLDER = 6,
+ HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
HIF_FIRST = 7,
- HIF_ABORTED = 9,
HIF_WAIT = 10,
};
@@ -154,20 +153,20 @@ struct gfs2_holder {
unsigned gh_flags;
int gh_error;
- unsigned long gh_iflags;
+ unsigned long gh_iflags; /* HIF_... */
unsigned long gh_ip;
};
enum {
- GLF_LOCK = 1,
- GLF_STICKY = 2,
- GLF_DEMOTE = 3,
- GLF_PENDING_DEMOTE = 4,
- GLF_DIRTY = 5,
- GLF_DEMOTE_IN_PROGRESS = 6,
- GLF_LFLUSH = 7,
- GLF_WAITERS2 = 8,
- GLF_CONV_DEADLK = 9,
+ GLF_LOCK = 1,
+ GLF_STICKY = 2,
+ GLF_DEMOTE = 3,
+ GLF_PENDING_DEMOTE = 4,
+ GLF_DEMOTE_IN_PROGRESS = 5,
+ GLF_DIRTY = 6,
+ GLF_LFLUSH = 7,
+ GLF_INVALIDATE_IN_PROGRESS = 8,
+ GLF_REPLY_PENDING = 9,
};
struct gfs2_glock {
@@ -179,19 +178,14 @@ struct gfs2_glock {
spinlock_t gl_spin;
unsigned int gl_state;
+ unsigned int gl_target;
+ unsigned int gl_reply;
unsigned int gl_hash;
unsigned int gl_demote_state; /* state requested by remote node */
unsigned long gl_demote_time; /* time of first demote request */
- struct pid *gl_owner_pid;
- unsigned long gl_ip;
struct list_head gl_holders;
- struct list_head gl_waiters1; /* HIF_MUTEX */
- struct list_head gl_waiters3; /* HIF_PROMOTE */
const struct gfs2_glock_operations *gl_ops;
-
- struct gfs2_holder *gl_req_gh;
-
void *gl_lock;
char *gl_lvb;
atomic_t gl_lvb_count;
@@ -392,20 +386,21 @@ struct gfs2_statfs_change_host {
#define GFS2_DATA_ORDERED 2
struct gfs2_args {
- char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
- char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
- char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
- int ar_spectator; /* Don't get a journal because we're always RO */
- int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */
- int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */
- int ar_localcaching; /* Local-style caching (dangerous on multihost) */
- int ar_debug; /* Oops on errors instead of trying to be graceful */
- int ar_upgrade; /* Upgrade ondisk/multihost format */
- unsigned int ar_num_glockd; /* Number of glockd threads */
- int ar_posix_acl; /* Enable posix acls */
- int ar_quota; /* off/account/on */
- int ar_suiddir; /* suiddir support */
- int ar_data; /* ordered/writeback */
+ char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */
+ char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
+ char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
+ unsigned int ar_spectator:1; /* Don't get a journal */
+ unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */
+ unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */
+ unsigned int ar_localcaching:1; /* Local caching */
+ unsigned int ar_debug:1; /* Oops on errors */
+ unsigned int ar_upgrade:1; /* Upgrade ondisk format */
+ unsigned int ar_posix_acl:1; /* Enable posix acls */
+ unsigned int ar_quota:2; /* off/account/on */
+ unsigned int ar_suiddir:1; /* suiddir support */
+ unsigned int ar_data:2; /* ordered/writeback */
+ unsigned int ar_meta:1; /* mount metafs */
+ unsigned int ar_num_glockd; /* Number of glockd threads */
};
struct gfs2_tune {
@@ -425,9 +420,7 @@ struct gfs2_tune {
unsigned int gt_quota_scale_den; /* Denominator */
unsigned int gt_quota_cache_secs;
unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
- unsigned int gt_atime_quantum; /* Min secs between atime updates */
unsigned int gt_new_files_jdata;
- unsigned int gt_new_files_directio;
unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
unsigned int gt_stall_secs; /* Detects trouble! */
unsigned int gt_complain_secs;
@@ -439,7 +432,7 @@ enum {
SDF_JOURNAL_CHECKED = 0,
SDF_JOURNAL_LIVE = 1,
SDF_SHUTDOWN = 2,
- SDF_NOATIME = 3,
+ SDF_NOBARRIERS = 3,
};
#define GFS2_FSNAME_LEN 256
@@ -468,7 +461,6 @@ struct gfs2_sb_host {
struct gfs2_sbd {
struct super_block *sd_vfs;
- struct super_block *sd_vfs_meta;
struct kobject sd_kobj;
unsigned long sd_flags; /* SDF_... */
struct gfs2_sb_host sd_sb;
@@ -506,7 +498,9 @@ struct gfs2_sbd {
/* Inode Stuff */
- struct inode *sd_master_dir;
+ struct dentry *sd_master_dir;
+ struct dentry *sd_root_dir;
+
struct inode *sd_jindex;
struct inode *sd_inum_inode;
struct inode *sd_statfs_inode;
@@ -534,7 +528,6 @@ struct gfs2_sbd {
struct mutex sd_rindex_mutex;
struct list_head sd_rindex_list;
struct list_head sd_rindex_mru_list;
- struct list_head sd_rindex_recent_list;
struct gfs2_rgrpd *sd_rindex_forward;
unsigned int sd_rgrps;
@@ -642,7 +635,6 @@ struct gfs2_sbd {
/* Debugging crud */
unsigned long sd_last_warning;
- struct vfsmount *sd_gfs2mnt;
struct dentry *debugfs_dir; /* debugfs directory */
struct dentry *debugfs_dentry_glocks; /* for debugfs */
};
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 09453d057e4..7cee695fa44 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -18,6 +18,7 @@
#include <linux/crc32.h>
#include <linux/lm_interface.h>
#include <linux/security.h>
+#include <linux/time.h>
#include "gfs2.h"
#include "incore.h"
@@ -249,6 +250,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
{
struct gfs2_dinode_host *di = &ip->i_di;
const struct gfs2_dinode *str = buf;
+ struct timespec atime;
u16 height, depth;
if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
@@ -275,8 +277,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
di->di_size = be64_to_cpu(str->di_size);
i_size_write(&ip->i_inode, di->di_size);
gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
- ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime);
- ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
+ atime.tv_sec = be64_to_cpu(str->di_atime);
+ atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
+ if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0)
+ ip->i_inode.i_atime = atime;
ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec);
ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
@@ -448,7 +452,7 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
struct qstr qstr;
struct inode *inode;
gfs2_str2qstr(&qstr, name);
- inode = gfs2_lookupi(dip, &qstr, 1, NULL);
+ inode = gfs2_lookupi(dip, &qstr, 1);
/* gfs2_lookupi has inconsistent callers: vfs
* related routines expect NULL for no entry found,
* gfs2_lookup_simple callers expect ENOENT
@@ -477,7 +481,7 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
*/
struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
- int is_root, struct nameidata *nd)
+ int is_root)
{
struct super_block *sb = dir->i_sb;
struct gfs2_inode *dip = GFS2_I(dir);
@@ -504,7 +508,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
}
if (!is_root) {
- error = permission(dir, MAY_EXEC, NULL);
+ error = gfs2_permission(dir, MAY_EXEC);
if (error)
goto out;
}
@@ -667,7 +671,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
{
int error;
- error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
if (error)
return error;
@@ -789,13 +793,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
gfs2_tune_get(sdp, gt_new_files_jdata))
di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
- if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
- gfs2_tune_get(sdp, gt_new_files_directio))
- di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
} else if (S_ISDIR(mode)) {
di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
- GFS2_DIF_INHERIT_DIRECTIO);
- di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
GFS2_DIF_INHERIT_JDATA);
}
@@ -1038,13 +1037,11 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
if (bh)
brelse(bh);
- if (!inode)
- return ERR_PTR(-ENOMEM);
return inode;
fail_gunlock2:
gfs2_glock_dq_uninit(ghs + 1);
- if (inode)
+ if (inode && !IS_ERR(inode))
iput(inode);
fail_gunlock:
gfs2_glock_dq(ghs);
@@ -1134,7 +1131,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
if (IS_APPEND(&dip->i_inode))
return -EPERM;
- error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
if (error)
return error;
@@ -1145,54 +1142,6 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
return 0;
}
-/*
- * gfs2_ok_to_move - check if it's ok to move a directory to another directory
- * @this: move this
- * @to: to here
- *
- * Follow @to back to the root and make sure we don't encounter @this
- * Assumes we already hold the rename lock.
- *
- * Returns: errno
- */
-
-int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
-{
- struct inode *dir = &to->i_inode;
- struct super_block *sb = dir->i_sb;
- struct inode *tmp;
- struct qstr dotdot;
- int error = 0;
-
- gfs2_str2qstr(&dotdot, "..");
-
- igrab(dir);
-
- for (;;) {
- if (dir == &this->i_inode) {
- error = -EINVAL;
- break;
- }
- if (dir == sb->s_root->d_inode) {
- error = 0;
- break;
- }
-
- tmp = gfs2_lookupi(dir, &dotdot, 1, NULL);
- if (IS_ERR(tmp)) {
- error = PTR_ERR(tmp);
- break;
- }
-
- iput(dir);
- dir = tmp;
- }
-
- iput(dir);
-
- return error;
-}
-
/**
* gfs2_readlinki - return the contents of a symlink
* @ip: the symlink's inode
@@ -1212,8 +1161,8 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
unsigned int x;
int error;
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
- error = gfs2_glock_nq_atime(&i_gh);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+ error = gfs2_glock_nq(&i_gh);
if (error) {
gfs2_holder_uninit(&i_gh);
return error;
@@ -1248,101 +1197,6 @@ out:
return error;
}
-/**
- * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and
- * conditionally update the inode's atime
- * @gh: the holder to acquire
- *
- * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap
- * Update if the difference between the current time and the inode's current
- * atime is greater than an interval specified at mount.
- *
- * Returns: errno
- */
-
-int gfs2_glock_nq_atime(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
- struct gfs2_inode *ip = gl->gl_object;
- s64 quantum = gfs2_tune_get(sdp, gt_atime_quantum);
- unsigned int state;
- int flags;
- int error;
- struct timespec tv = CURRENT_TIME;
-
- if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) ||
- gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) ||
- gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops))
- return -EINVAL;
-
- state = gh->gh_state;
- flags = gh->gh_flags;
-
- error = gfs2_glock_nq(gh);
- if (error)
- return error;
-
- if (test_bit(SDF_NOATIME, &sdp->sd_flags) ||
- (sdp->sd_vfs->s_flags & MS_RDONLY))
- return 0;
-
- if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
- gfs2_glock_dq(gh);
- gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY,
- gh);
- error = gfs2_glock_nq(gh);
- if (error)
- return error;
-
- /* Verify that atime hasn't been updated while we were
- trying to get exclusive lock. */
-
- tv = CURRENT_TIME;
- if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) {
- struct buffer_head *dibh;
- struct gfs2_dinode *di;
-
- error = gfs2_trans_begin(sdp, RES_DINODE, 0);
- if (error == -EROFS)
- return 0;
- if (error)
- goto fail;
-
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- goto fail_end_trans;
-
- ip->i_inode.i_atime = tv;
-
- gfs2_trans_add_bh(ip->i_gl, dibh, 1);
- di = (struct gfs2_dinode *)dibh->b_data;
- di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
- di->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec);
- brelse(dibh);
-
- gfs2_trans_end(sdp);
- }
-
- /* If someone else has asked for the glock,
- unlock and let them have it. Then reacquire
- in the original state. */
- if (gfs2_glock_is_blocking(gl)) {
- gfs2_glock_dq(gh);
- gfs2_holder_reinit(state, flags, gh);
- return gfs2_glock_nq(gh);
- }
- }
-
- return 0;
-
-fail_end_trans:
- gfs2_trans_end(sdp);
-fail:
- gfs2_glock_dq(gh);
- return error;
-}
-
static int
__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
{
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 580da454b38..2d43f69610a 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -72,7 +72,6 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
}
-void gfs2_inode_attr_in(struct gfs2_inode *ip);
void gfs2_set_iop(struct inode *inode);
struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
u64 no_addr, u64 no_formal_ino,
@@ -84,16 +83,15 @@ int gfs2_inode_refresh(struct gfs2_inode *ip);
int gfs2_dinode_dealloc(struct gfs2_inode *inode);
int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
- int is_root, struct nameidata *nd);
+ int is_root);
struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
unsigned int mode, dev_t dev);
int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
struct gfs2_inode *ip);
int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
const struct gfs2_inode *ip);
-int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
+int gfs2_permission(struct inode *inode, int mask);
int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
-int gfs2_glock_nq_atime(struct gfs2_holder *gh);
int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
index 663fee72878..523243a13a2 100644
--- a/fs/gfs2/locking.c
+++ b/fs/gfs2/locking.c
@@ -23,12 +23,54 @@ struct lmh_wrapper {
const struct lm_lockops *lw_ops;
};
+static int nolock_mount(char *table_name, char *host_data,
+ lm_callback_t cb, void *cb_data,
+ unsigned int min_lvb_size, int flags,
+ struct lm_lockstruct *lockstruct,
+ struct kobject *fskobj);
+
/* List of registered low-level locking protocols. A file system selects one
of them by name at mount time, e.g. lock_nolock, lock_dlm. */
+static const struct lm_lockops nolock_ops = {
+ .lm_proto_name = "lock_nolock",
+ .lm_mount = nolock_mount,
+};
+
+static struct lmh_wrapper nolock_proto = {
+ .lw_list = LIST_HEAD_INIT(nolock_proto.lw_list),
+ .lw_ops = &nolock_ops,
+};
+
static LIST_HEAD(lmh_list);
static DEFINE_MUTEX(lmh_lock);
+static int nolock_mount(char *table_name, char *host_data,
+ lm_callback_t cb, void *cb_data,
+ unsigned int min_lvb_size, int flags,
+ struct lm_lockstruct *lockstruct,
+ struct kobject *fskobj)
+{
+ char *c;
+ unsigned int jid;
+
+ c = strstr(host_data, "jid=");
+ if (!c)
+ jid = 0;
+ else {
+ c += 4;
+ sscanf(c, "%u", &jid);
+ }
+
+ lockstruct->ls_jid = jid;
+ lockstruct->ls_first = 1;
+ lockstruct->ls_lvb_size = min_lvb_size;
+ lockstruct->ls_ops = &nolock_ops;
+ lockstruct->ls_flags = LM_LSFLAG_LOCAL;
+
+ return 0;
+}
+
/**
* gfs2_register_lockproto - Register a low-level locking protocol
* @proto: the protocol definition
@@ -116,9 +158,13 @@ int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
int try = 0;
int error, found;
+
retry:
mutex_lock(&lmh_lock);
+ if (list_empty(&nolock_proto.lw_list))
+ list_add(&nolock_proto.lw_list, &lmh_list);
+
found = 0;
list_for_each_entry(lw, &lmh_list, lw_list) {
if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
@@ -139,7 +185,8 @@ retry:
goto out;
}
- if (!try_module_get(lw->lw_ops->lm_owner)) {
+ if (lw->lw_ops->lm_owner &&
+ !try_module_get(lw->lw_ops->lm_owner)) {
try = 0;
mutex_unlock(&lmh_lock);
msleep(1000);
@@ -158,7 +205,8 @@ out:
void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
{
mutex_lock(&lmh_lock);
- lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
+ if (lockstruct->ls_ops->lm_unmount)
+ lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
if (lockstruct->ls_ops->lm_owner)
module_put(lockstruct->ls_ops->lm_owner);
mutex_unlock(&lmh_lock);
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index cf7ea8abec8..2482c904750 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -11,46 +11,60 @@
static char junk_lvb[GDLM_LVB_SIZE];
-static void queue_complete(struct gdlm_lock *lp)
+
+/* convert dlm lock-mode to gfs lock-state */
+
+static s16 gdlm_make_lmstate(s16 dlmmode)
{
- struct gdlm_ls *ls = lp->ls;
+ switch (dlmmode) {
+ case DLM_LOCK_IV:
+ case DLM_LOCK_NL:
+ return LM_ST_UNLOCKED;
+ case DLM_LOCK_EX:
+ return LM_ST_EXCLUSIVE;
+ case DLM_LOCK_CW:
+ return LM_ST_DEFERRED;
+ case DLM_LOCK_PR:
+ return LM_ST_SHARED;
+ }
+ gdlm_assert(0, "unknown DLM mode %d", dlmmode);
+ return -1;
+}
- clear_bit(LFL_ACTIVE, &lp->flags);
+/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
+ thread gets to it. */
+
+static void queue_submit(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
spin_lock(&ls->async_lock);
- list_add_tail(&lp->clist, &ls->complete);
+ list_add_tail(&lp->delay_list, &ls->submit);
spin_unlock(&ls->async_lock);
wake_up(&ls->thread_wait);
}
-static inline void gdlm_ast(void *astarg)
+static void wake_up_ast(struct gdlm_lock *lp)
{
- queue_complete(astarg);
+ clear_bit(LFL_AST_WAIT, &lp->flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&lp->flags, LFL_AST_WAIT);
}
-static inline void gdlm_bast(void *astarg, int mode)
+static void gdlm_delete_lp(struct gdlm_lock *lp)
{
- struct gdlm_lock *lp = astarg;
struct gdlm_ls *ls = lp->ls;
- if (!mode) {
- printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- return;
- }
-
spin_lock(&ls->async_lock);
- if (!lp->bast_mode) {
- list_add_tail(&lp->blist, &ls->blocking);
- lp->bast_mode = mode;
- } else if (lp->bast_mode < mode)
- lp->bast_mode = mode;
+ if (!list_empty(&lp->delay_list))
+ list_del_init(&lp->delay_list);
+ ls->all_locks_count--;
spin_unlock(&ls->async_lock);
- wake_up(&ls->thread_wait);
+
+ kfree(lp);
}
-void gdlm_queue_delayed(struct gdlm_lock *lp)
+static void gdlm_queue_delayed(struct gdlm_lock *lp)
{
struct gdlm_ls *ls = lp->ls;
@@ -59,6 +73,236 @@ void gdlm_queue_delayed(struct gdlm_lock *lp)
spin_unlock(&ls->async_lock);
}
+static void process_complete(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
+ struct lm_async_cb acb;
+
+ memset(&acb, 0, sizeof(acb));
+
+ if (lp->lksb.sb_status == -DLM_ECANCEL) {
+ log_info("complete dlm cancel %x,%llx flags %lx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+
+ lp->req = lp->cur;
+ acb.lc_ret |= LM_OUT_CANCELED;
+ if (lp->cur == DLM_LOCK_IV)
+ lp->lksb.sb_lkid = 0;
+ goto out;
+ }
+
+ if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
+ if (lp->lksb.sb_status != -DLM_EUNLOCK) {
+ log_info("unlock sb_status %d %x,%llx flags %lx",
+ lp->lksb.sb_status, lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+ return;
+ }
+
+ lp->cur = DLM_LOCK_IV;
+ lp->req = DLM_LOCK_IV;
+ lp->lksb.sb_lkid = 0;
+
+ if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
+ gdlm_delete_lp(lp);
+ return;
+ }
+ goto out;
+ }
+
+ if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
+ memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
+
+ if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
+ if (lp->req == DLM_LOCK_PR)
+ lp->req = DLM_LOCK_CW;
+ else if (lp->req == DLM_LOCK_CW)
+ lp->req = DLM_LOCK_PR;
+ }
+
+ /*
+ * A canceled lock request. The lock was just taken off the delayed
+ * list and was never even submitted to dlm.
+ */
+
+ if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
+ log_info("complete internal cancel %x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ lp->req = lp->cur;
+ acb.lc_ret |= LM_OUT_CANCELED;
+ goto out;
+ }
+
+ /*
+ * An error occured.
+ */
+
+ if (lp->lksb.sb_status) {
+ /* a "normal" error */
+ if ((lp->lksb.sb_status == -EAGAIN) &&
+ (lp->lkf & DLM_LKF_NOQUEUE)) {
+ lp->req = lp->cur;
+ if (lp->cur == DLM_LOCK_IV)
+ lp->lksb.sb_lkid = 0;
+ goto out;
+ }
+
+ /* this could only happen with cancels I think */
+ log_info("ast sb_status %d %x,%llx flags %lx",
+ lp->lksb.sb_status, lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+ return;
+ }
+
+ /*
+ * This is an AST for an EX->EX conversion for sync_lvb from GFS.
+ */
+
+ if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
+ wake_up_ast(lp);
+ return;
+ }
+
+ /*
+ * A lock has been demoted to NL because it initially completed during
+ * BLOCK_LOCKS. Now it must be requested in the originally requested
+ * mode.
+ */
+
+ if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
+ gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+
+ lp->cur = DLM_LOCK_NL;
+ lp->req = lp->prev_req;
+ lp->prev_req = DLM_LOCK_IV;
+ lp->lkf &= ~DLM_LKF_CONVDEADLK;
+
+ set_bit(LFL_NOCACHE, &lp->flags);
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+ !test_bit(LFL_NOBLOCK, &lp->flags))
+ gdlm_queue_delayed(lp);
+ else
+ queue_submit(lp);
+ return;
+ }
+
+ /*
+ * A request is granted during dlm recovery. It may be granted
+ * because the locks of a failed node were cleared. In that case,
+ * there may be inconsistent data beneath this lock and we must wait
+ * for recovery to complete to use it. When gfs recovery is done this
+ * granted lock will be converted to NL and then reacquired in this
+ * granted state.
+ */
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+ !test_bit(LFL_NOBLOCK, &lp->flags) &&
+ lp->req != DLM_LOCK_NL) {
+
+ lp->cur = lp->req;
+ lp->prev_req = lp->req;
+ lp->req = DLM_LOCK_NL;
+ lp->lkf |= DLM_LKF_CONVERT;
+ lp->lkf &= ~DLM_LKF_CONVDEADLK;
+
+ log_debug("rereq %x,%llx id %x %d,%d",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->lksb.sb_lkid, lp->cur, lp->req);
+
+ set_bit(LFL_REREQUEST, &lp->flags);
+ queue_submit(lp);
+ return;
+ }
+
+ /*
+ * DLM demoted the lock to NL before it was granted so GFS must be
+ * told it cannot cache data for this lock.
+ */
+
+ if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
+ set_bit(LFL_NOCACHE, &lp->flags);
+
+out:
+ /*
+ * This is an internal lock_dlm lock
+ */
+
+ if (test_bit(LFL_INLOCK, &lp->flags)) {
+ clear_bit(LFL_NOBLOCK, &lp->flags);
+ lp->cur = lp->req;
+ wake_up_ast(lp);
+ return;
+ }
+
+ /*
+ * Normal completion of a lock request. Tell GFS it now has the lock.
+ */
+
+ clear_bit(LFL_NOBLOCK, &lp->flags);
+ lp->cur = lp->req;
+
+ acb.lc_name = lp->lockname;
+ acb.lc_ret |= gdlm_make_lmstate(lp->cur);
+
+ ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
+}
+
+static void gdlm_ast(void *astarg)
+{
+ struct gdlm_lock *lp = astarg;
+ clear_bit(LFL_ACTIVE, &lp->flags);
+ process_complete(lp);
+}
+
+static void process_blocking(struct gdlm_lock *lp, int bast_mode)
+{
+ struct gdlm_ls *ls = lp->ls;
+ unsigned int cb = 0;
+
+ switch (gdlm_make_lmstate(bast_mode)) {
+ case LM_ST_EXCLUSIVE:
+ cb = LM_CB_NEED_E;
+ break;
+ case LM_ST_DEFERRED:
+ cb = LM_CB_NEED_D;
+ break;
+ case LM_ST_SHARED:
+ cb = LM_CB_NEED_S;
+ break;
+ default:
+ gdlm_assert(0, "unknown bast mode %u", bast_mode);
+ }
+
+ ls->fscb(ls->sdp, cb, &lp->lockname);
+}
+
+
+static void gdlm_bast(void *astarg, int mode)
+{
+ struct gdlm_lock *lp = astarg;
+
+ if (!mode) {
+ printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ return;
+ }
+
+ process_blocking(lp, mode);
+}
+
/* convert gfs lock-state to dlm lock-mode */
static s16 make_mode(s16 lmstate)
@@ -77,24 +321,6 @@ static s16 make_mode(s16 lmstate)
return -1;
}
-/* convert dlm lock-mode to gfs lock-state */
-
-s16 gdlm_make_lmstate(s16 dlmmode)
-{
- switch (dlmmode) {
- case DLM_LOCK_IV:
- case DLM_LOCK_NL:
- return LM_ST_UNLOCKED;
- case DLM_LOCK_EX:
- return LM_ST_EXCLUSIVE;
- case DLM_LOCK_CW:
- return LM_ST_DEFERRED;
- case DLM_LOCK_PR:
- return LM_ST_SHARED;
- }
- gdlm_assert(0, "unknown DLM mode %d", dlmmode);
- return -1;
-}
/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
@@ -134,14 +360,6 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
if (lp->lksb.sb_lkid != 0) {
lkf |= DLM_LKF_CONVERT;
-
- /* Conversion deadlock avoidance by DLM */
-
- if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) &&
- !test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
- !(lkf & DLM_LKF_NOQUEUE) &&
- cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
- lkf |= DLM_LKF_CONVDEADLK;
}
if (lp->lvb)
@@ -173,14 +391,9 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
make_strname(name, &lp->strname);
lp->ls = ls;
lp->cur = DLM_LOCK_IV;
- lp->lvb = NULL;
- lp->hold_null = NULL;
- INIT_LIST_HEAD(&lp->clist);
- INIT_LIST_HEAD(&lp->blist);
INIT_LIST_HEAD(&lp->delay_list);
spin_lock(&ls->async_lock);
- list_add(&lp->all_list, &ls->all_locks);
ls->all_locks_count++;
spin_unlock(&ls->async_lock);
@@ -188,26 +401,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
return 0;
}
-void gdlm_delete_lp(struct gdlm_lock *lp)
-{
- struct gdlm_ls *ls = lp->ls;
-
- spin_lock(&ls->async_lock);
- if (!list_empty(&lp->clist))
- list_del_init(&lp->clist);
- if (!list_empty(&lp->blist))
- list_del_init(&lp->blist);
- if (!list_empty(&lp->delay_list))
- list_del_init(&lp->delay_list);
- gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- list_del_init(&lp->all_list);
- ls->all_locks_count--;
- spin_unlock(&ls->async_lock);
-
- kfree(lp);
-}
-
int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
void **lockp)
{
@@ -261,7 +454,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
lp->lksb.sb_status = -EAGAIN;
- queue_complete(lp);
+ gdlm_ast(lp);
error = 0;
}
@@ -308,6 +501,12 @@ unsigned int gdlm_lock(void *lock, unsigned int cur_state,
{
struct gdlm_lock *lp = lock;
+ if (req_state == LM_ST_UNLOCKED)
+ return gdlm_unlock(lock, cur_state);
+
+ if (req_state == LM_ST_UNLOCKED)
+ return gdlm_unlock(lock, cur_state);
+
clear_bit(LFL_DLM_CANCEL, &lp->flags);
if (flags & LM_FLAG_NOEXP)
set_bit(LFL_NOBLOCK, &lp->flags);
@@ -351,7 +550,7 @@ void gdlm_cancel(void *lock)
if (delay_list) {
set_bit(LFL_CANCEL, &lp->flags);
set_bit(LFL_ACTIVE, &lp->flags);
- queue_complete(lp);
+ gdlm_ast(lp);
return;
}
@@ -507,22 +706,3 @@ void gdlm_submit_delayed(struct gdlm_ls *ls)
wake_up(&ls->thread_wait);
}
-int gdlm_release_all_locks(struct gdlm_ls *ls)
-{
- struct gdlm_lock *lp, *safe;
- int count = 0;
-
- spin_lock(&ls->async_lock);
- list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
- list_del_init(&lp->all_list);
-
- if (lp->lvb && lp->lvb != junk_lvb)
- kfree(lp->lvb);
- kfree(lp);
- count++;
- }
- spin_unlock(&ls->async_lock);
-
- return count;
-}
-
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index a243cf69c54..3c98e7c6f93 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -72,19 +72,12 @@ struct gdlm_ls {
int recover_jid_done;
int recover_jid_status;
spinlock_t async_lock;
- struct list_head complete;
- struct list_head blocking;
struct list_head delayed;
struct list_head submit;
- struct list_head all_locks;
u32 all_locks_count;
wait_queue_head_t wait_control;
- struct task_struct *thread1;
- struct task_struct *thread2;
+ struct task_struct *thread;
wait_queue_head_t thread_wait;
- unsigned long drop_time;
- int drop_locks_count;
- int drop_locks_period;
};
enum {
@@ -117,12 +110,7 @@ struct gdlm_lock {
u32 lkf; /* dlm flags DLM_LKF_ */
unsigned long flags; /* lock_dlm flags LFL_ */
- int bast_mode; /* protected by async_lock */
-
- struct list_head clist; /* complete */
- struct list_head blist; /* blocking */
struct list_head delay_list; /* delayed */
- struct list_head all_list; /* all locks for the fs */
struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
};
@@ -159,11 +147,7 @@ void gdlm_release_threads(struct gdlm_ls *);
/* lock.c */
-s16 gdlm_make_lmstate(s16);
-void gdlm_queue_delayed(struct gdlm_lock *);
void gdlm_submit_delayed(struct gdlm_ls *);
-int gdlm_release_all_locks(struct gdlm_ls *);
-void gdlm_delete_lp(struct gdlm_lock *);
unsigned int gdlm_do_lock(struct gdlm_lock *);
int gdlm_get_lock(void *, struct lm_lockname *, void **);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 470bdf650b5..0c4cbe6c828 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -22,22 +22,14 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
if (!ls)
return NULL;
- ls->drop_locks_count = GDLM_DROP_COUNT;
- ls->drop_locks_period = GDLM_DROP_PERIOD;
ls->fscb = cb;
ls->sdp = sdp;
ls->fsflags = flags;
spin_lock_init(&ls->async_lock);
- INIT_LIST_HEAD(&ls->complete);
- INIT_LIST_HEAD(&ls->blocking);
INIT_LIST_HEAD(&ls->delayed);
INIT_LIST_HEAD(&ls->submit);
- INIT_LIST_HEAD(&ls->all_locks);
init_waitqueue_head(&ls->thread_wait);
init_waitqueue_head(&ls->wait_control);
- ls->thread1 = NULL;
- ls->thread2 = NULL;
- ls->drop_time = jiffies;
ls->jid = -1;
strncpy(buf, table_name, 256);
@@ -152,7 +144,8 @@ static int gdlm_mount(char *table_name, char *host_data,
error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
&ls->dlm_lockspace,
- DLM_LSFL_FS | (nodir ? DLM_LSFL_NODIR : 0),
+ DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
+ (nodir ? DLM_LSFL_NODIR : 0),
GDLM_LVB_SIZE);
if (error) {
log_error("dlm_new_lockspace error %d", error);
@@ -180,7 +173,6 @@ out:
static void gdlm_unmount(void *lockspace)
{
struct gdlm_ls *ls = lockspace;
- int rv;
log_debug("unmount flags %lx", ls->flags);
@@ -194,9 +186,7 @@ static void gdlm_unmount(void *lockspace)
gdlm_kobject_release(ls);
dlm_release_lockspace(ls->dlm_lockspace, 2);
gdlm_release_threads(ls);
- rv = gdlm_release_all_locks(ls);
- if (rv)
- log_info("gdlm_unmount: %d stray locks freed", rv);
+ BUG_ON(ls->all_locks_count);
out:
kfree(ls);
}
@@ -232,7 +222,6 @@ static void gdlm_withdraw(void *lockspace)
dlm_release_lockspace(ls->dlm_lockspace, 2);
gdlm_release_threads(ls);
- gdlm_release_all_locks(ls);
gdlm_kobject_release(ls);
}
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index a4ff271df9e..4ec571c3d8a 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -114,17 +114,6 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
return sprintf(buf, "%d\n", ls->recover_jid_status);
}
-static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf)
-{
- return sprintf(buf, "%d\n", ls->drop_locks_count);
-}
-
-static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len)
-{
- ls->drop_locks_count = simple_strtol(buf, NULL, 0);
- return len;
-}
-
struct gdlm_attr {
struct attribute attr;
ssize_t (*show)(struct gdlm_ls *, char *);
@@ -144,7 +133,6 @@ GDLM_ATTR(first_done, 0444, first_done_show, NULL);
GDLM_ATTR(recover, 0644, recover_show, recover_store);
GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
-GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store);
static struct attribute *gdlm_attrs[] = {
&gdlm_attr_proto_name.attr,
@@ -157,7 +145,6 @@ static struct attribute *gdlm_attrs[] = {
&gdlm_attr_recover.attr,
&gdlm_attr_recover_done.attr,
&gdlm_attr_recover_status.attr,
- &gdlm_attr_drop_count.attr,
NULL,
};
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index e53db6fd28a..38823efd698 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -9,367 +9,60 @@
#include "lock_dlm.h"
-/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
- thread gets to it. */
-
-static void queue_submit(struct gdlm_lock *lp)
-{
- struct gdlm_ls *ls = lp->ls;
-
- spin_lock(&ls->async_lock);
- list_add_tail(&lp->delay_list, &ls->submit);
- spin_unlock(&ls->async_lock);
- wake_up(&ls->thread_wait);
-}
-
-static void process_blocking(struct gdlm_lock *lp, int bast_mode)
-{
- struct gdlm_ls *ls = lp->ls;
- unsigned int cb = 0;
-
- switch (gdlm_make_lmstate(bast_mode)) {
- case LM_ST_EXCLUSIVE:
- cb = LM_CB_NEED_E;
- break;
- case LM_ST_DEFERRED:
- cb = LM_CB_NEED_D;
- break;
- case LM_ST_SHARED:
- cb = LM_CB_NEED_S;
- break;
- default:
- gdlm_assert(0, "unknown bast mode %u", lp->bast_mode);
- }
-
- ls->fscb(ls->sdp, cb, &lp->lockname);
-}
-
-static void wake_up_ast(struct gdlm_lock *lp)
-{
- clear_bit(LFL_AST_WAIT, &lp->flags);
- smp_mb__after_clear_bit();
- wake_up_bit(&lp->flags, LFL_AST_WAIT);
-}
-
-static void process_complete(struct gdlm_lock *lp)
-{
- struct gdlm_ls *ls = lp->ls;
- struct lm_async_cb acb;
- s16 prev_mode = lp->cur;
-
- memset(&acb, 0, sizeof(acb));
-
- if (lp->lksb.sb_status == -DLM_ECANCEL) {
- log_info("complete dlm cancel %x,%llx flags %lx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->flags);
-
- lp->req = lp->cur;
- acb.lc_ret |= LM_OUT_CANCELED;
- if (lp->cur == DLM_LOCK_IV)
- lp->lksb.sb_lkid = 0;
- goto out;
- }
-
- if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
- if (lp->lksb.sb_status != -DLM_EUNLOCK) {
- log_info("unlock sb_status %d %x,%llx flags %lx",
- lp->lksb.sb_status, lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->flags);
- return;
- }
-
- lp->cur = DLM_LOCK_IV;
- lp->req = DLM_LOCK_IV;
- lp->lksb.sb_lkid = 0;
-
- if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
- gdlm_delete_lp(lp);
- return;
- }
- goto out;
- }
-
- if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
- memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
-
- if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
- if (lp->req == DLM_LOCK_PR)
- lp->req = DLM_LOCK_CW;
- else if (lp->req == DLM_LOCK_CW)
- lp->req = DLM_LOCK_PR;
- }
-
- /*
- * A canceled lock request. The lock was just taken off the delayed
- * list and was never even submitted to dlm.
- */
-
- if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
- log_info("complete internal cancel %x,%llx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- lp->req = lp->cur;
- acb.lc_ret |= LM_OUT_CANCELED;
- goto out;
- }
-
- /*
- * An error occured.
- */
-
- if (lp->lksb.sb_status) {
- /* a "normal" error */
- if ((lp->lksb.sb_status == -EAGAIN) &&
- (lp->lkf & DLM_LKF_NOQUEUE)) {
- lp->req = lp->cur;
- if (lp->cur == DLM_LOCK_IV)
- lp->lksb.sb_lkid = 0;
- goto out;
- }
-
- /* this could only happen with cancels I think */
- log_info("ast sb_status %d %x,%llx flags %lx",
- lp->lksb.sb_status, lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->flags);
- if (lp->lksb.sb_status == -EDEADLOCK &&
- lp->ls->fsflags & LM_MFLAG_CONV_NODROP) {
- lp->req = lp->cur;
- acb.lc_ret |= LM_OUT_CONV_DEADLK;
- if (lp->cur == DLM_LOCK_IV)
- lp->lksb.sb_lkid = 0;
- goto out;
- } else
- return;
- }
-
- /*
- * This is an AST for an EX->EX conversion for sync_lvb from GFS.
- */
-
- if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
- wake_up_ast(lp);
- return;
- }
-
- /*
- * A lock has been demoted to NL because it initially completed during
- * BLOCK_LOCKS. Now it must be requested in the originally requested
- * mode.
- */
-
- if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
- gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
-
- lp->cur = DLM_LOCK_NL;
- lp->req = lp->prev_req;
- lp->prev_req = DLM_LOCK_IV;
- lp->lkf &= ~DLM_LKF_CONVDEADLK;
-
- set_bit(LFL_NOCACHE, &lp->flags);
-
- if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
- !test_bit(LFL_NOBLOCK, &lp->flags))
- gdlm_queue_delayed(lp);
- else
- queue_submit(lp);
- return;
- }
-
- /*
- * A request is granted during dlm recovery. It may be granted
- * because the locks of a failed node were cleared. In that case,
- * there may be inconsistent data beneath this lock and we must wait
- * for recovery to complete to use it. When gfs recovery is done this
- * granted lock will be converted to NL and then reacquired in this
- * granted state.
- */
-
- if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
- !test_bit(LFL_NOBLOCK, &lp->flags) &&
- lp->req != DLM_LOCK_NL) {
-
- lp->cur = lp->req;
- lp->prev_req = lp->req;
- lp->req = DLM_LOCK_NL;
- lp->lkf |= DLM_LKF_CONVERT;
- lp->lkf &= ~DLM_LKF_CONVDEADLK;
-
- log_debug("rereq %x,%llx id %x %d,%d",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->lksb.sb_lkid, lp->cur, lp->req);
-
- set_bit(LFL_REREQUEST, &lp->flags);
- queue_submit(lp);
- return;
- }
-
- /*
- * DLM demoted the lock to NL before it was granted so GFS must be
- * told it cannot cache data for this lock.
- */
-
- if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
- set_bit(LFL_NOCACHE, &lp->flags);
-
-out:
- /*
- * This is an internal lock_dlm lock
- */
-
- if (test_bit(LFL_INLOCK, &lp->flags)) {
- clear_bit(LFL_NOBLOCK, &lp->flags);
- lp->cur = lp->req;
- wake_up_ast(lp);
- return;
- }
-
- /*
- * Normal completion of a lock request. Tell GFS it now has the lock.
- */
-
- clear_bit(LFL_NOBLOCK, &lp->flags);
- lp->cur = lp->req;
-
- acb.lc_name = lp->lockname;
- acb.lc_ret |= gdlm_make_lmstate(lp->cur);
-
- if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
- (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
- acb.lc_ret |= LM_OUT_CACHEABLE;
-
- ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
-}
-
-static inline int no_work(struct gdlm_ls *ls, int blocking)
+static inline int no_work(struct gdlm_ls *ls)
{
int ret;
spin_lock(&ls->async_lock);
- ret = list_empty(&ls->complete) && list_empty(&ls->submit);
- if (ret && blocking)
- ret = list_empty(&ls->blocking);
+ ret = list_empty(&ls->submit);
spin_unlock(&ls->async_lock);
return ret;
}
-static inline int check_drop(struct gdlm_ls *ls)
-{
- if (!ls->drop_locks_count)
- return 0;
-
- if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
- ls->drop_time = jiffies;
- if (ls->all_locks_count >= ls->drop_locks_count)
- return 1;
- }
- return 0;
-}
-
-static int gdlm_thread(void *data, int blist)
+static int gdlm_thread(void *data)
{
struct gdlm_ls *ls = (struct gdlm_ls *) data;
struct gdlm_lock *lp = NULL;
- uint8_t complete, blocking, submit, drop;
-
- /* Only thread1 is allowed to do blocking callbacks since gfs
- may wait for a completion callback within a blocking cb. */
while (!kthread_should_stop()) {
wait_event_interruptible(ls->thread_wait,
- !no_work(ls, blist) || kthread_should_stop());
-
- complete = blocking = submit = drop = 0;
+ !no_work(ls) || kthread_should_stop());
spin_lock(&ls->async_lock);
- if (blist && !list_empty(&ls->blocking)) {
- lp = list_entry(ls->blocking.next, struct gdlm_lock,
- blist);
- list_del_init(&lp->blist);
- blocking = lp->bast_mode;
- lp->bast_mode = 0;
- } else if (!list_empty(&ls->complete)) {
- lp = list_entry(ls->complete.next, struct gdlm_lock,
- clist);
- list_del_init(&lp->clist);
- complete = 1;
- } else if (!list_empty(&ls->submit)) {
+ if (!list_empty(&ls->submit)) {
lp = list_entry(ls->submit.next, struct gdlm_lock,
delay_list);
list_del_init(&lp->delay_list);
- submit = 1;
+ spin_unlock(&ls->async_lock);
+ gdlm_do_lock(lp);
+ spin_lock(&ls->async_lock);
}
-
- drop = check_drop(ls);
spin_unlock(&ls->async_lock);
-
- if (complete)
- process_complete(lp);
-
- else if (blocking)
- process_blocking(lp, blocking);
-
- else if (submit)
- gdlm_do_lock(lp);
-
- if (drop)
- ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL);
-
- schedule();
}
return 0;
}
-static int gdlm_thread1(void *data)
-{
- return gdlm_thread(data, 1);
-}
-
-static int gdlm_thread2(void *data)
-{
- return gdlm_thread(data, 0);
-}
-
int gdlm_init_threads(struct gdlm_ls *ls)
{
struct task_struct *p;
int error;
- p = kthread_run(gdlm_thread1, ls, "lock_dlm1");
- error = IS_ERR(p);
- if (error) {
- log_error("can't start lock_dlm1 thread %d", error);
- return error;
- }
- ls->thread1 = p;
-
- p = kthread_run(gdlm_thread2, ls, "lock_dlm2");
+ p = kthread_run(gdlm_thread, ls, "lock_dlm");
error = IS_ERR(p);
if (error) {
- log_error("can't start lock_dlm2 thread %d", error);
- kthread_stop(ls->thread1);
+ log_error("can't start lock_dlm thread %d", error);
return error;
}
- ls->thread2 = p;
+ ls->thread = p;
return 0;
}
void gdlm_release_threads(struct gdlm_ls *ls)
{
- kthread_stop(ls->thread1);
- kthread_stop(ls->thread2);
+ kthread_stop(ls->thread);
}
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
deleted file mode 100644
index 35e9730bc3a..00000000000
--- a/fs/gfs2/locking/nolock/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o
-lock_nolock-y := main.o
-
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
deleted file mode 100644
index 284a5ece8d9..00000000000
--- a/fs/gfs2/locking/nolock/main.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/lm_interface.h>
-
-struct nolock_lockspace {
- unsigned int nl_lvb_size;
-};
-
-static const struct lm_lockops nolock_ops;
-
-static int nolock_mount(char *table_name, char *host_data,
- lm_callback_t cb, void *cb_data,
- unsigned int min_lvb_size, int flags,
- struct lm_lockstruct *lockstruct,
- struct kobject *fskobj)
-{
- char *c;
- unsigned int jid;
- struct nolock_lockspace *nl;
-
- c = strstr(host_data, "jid=");
- if (!c)
- jid = 0;
- else {
- c += 4;
- sscanf(c, "%u", &jid);
- }
-
- nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
- if (!nl)
- return -ENOMEM;
-
- nl->nl_lvb_size = min_lvb_size;
-
- lockstruct->ls_jid = jid;
- lockstruct->ls_first = 1;
- lockstruct->ls_lvb_size = min_lvb_size;
- lockstruct->ls_lockspace = nl;
- lockstruct->ls_ops = &nolock_ops;
- lockstruct->ls_flags = LM_LSFLAG_LOCAL;
-
- return 0;
-}
-
-static void nolock_others_may_mount(void *lockspace)
-{
-}
-
-static void nolock_unmount(void *lockspace)
-{
- struct nolock_lockspace *nl = lockspace;
- kfree(nl);
-}
-
-static void nolock_withdraw(void *lockspace)
-{
-}
-
-/**
- * nolock_get_lock - get a lm_lock_t given a descripton of the lock
- * @lockspace: the lockspace the lock lives in
- * @name: the name of the lock
- * @lockp: return the lm_lock_t here
- *
- * Returns: 0 on success, -EXXX on failure
- */
-
-static int nolock_get_lock(void *lockspace, struct lm_lockname *name,
- void **lockp)
-{
- *lockp = lockspace;
- return 0;
-}
-
-/**
- * nolock_put_lock - get rid of a lock structure
- * @lock: the lock to throw away
- *
- */
-
-static void nolock_put_lock(void *lock)
-{
-}
-
-/**
- * nolock_lock - acquire a lock
- * @lock: the lock to manipulate
- * @cur_state: the current state
- * @req_state: the requested state
- * @flags: modifier flags
- *
- * Returns: A bitmap of LM_OUT_*
- */
-
-static unsigned int nolock_lock(void *lock, unsigned int cur_state,
- unsigned int req_state, unsigned int flags)
-{
- return req_state | LM_OUT_CACHEABLE;
-}
-
-/**
- * nolock_unlock - unlock a lock
- * @lock: the lock to manipulate
- * @cur_state: the current state
- *
- * Returns: 0
- */
-
-static unsigned int nolock_unlock(void *lock, unsigned int cur_state)
-{
- return 0;
-}
-
-static void nolock_cancel(void *lock)
-{
-}
-
-/**
- * nolock_hold_lvb - hold on to a lock value block
- * @lock: the lock the LVB is associated with
- * @lvbp: return the lm_lvb_t here
- *
- * Returns: 0 on success, -EXXX on failure
- */
-
-static int nolock_hold_lvb(void *lock, char **lvbp)
-{
- struct nolock_lockspace *nl = lock;
- int error = 0;
-
- *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS);
- if (!*lvbp)
- error = -ENOMEM;
-
- return error;
-}
-
-/**
- * nolock_unhold_lvb - release a LVB
- * @lock: the lock the LVB is associated with
- * @lvb: the lock value block
- *
- */
-
-static void nolock_unhold_lvb(void *lock, char *lvb)
-{
- kfree(lvb);
-}
-
-static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
- struct file *file, struct file_lock *fl)
-{
- posix_test_lock(file, fl);
-
- return 0;
-}
-
-static int nolock_plock(void *lockspace, struct lm_lockname *name,
- struct file *file, int cmd, struct file_lock *fl)
-{
- int error;
- error = posix_lock_file_wait(file, fl);
- return error;
-}
-
-static int nolock_punlock(void *lockspace, struct lm_lockname *name,
- struct file *file, struct file_lock *fl)
-{
- int error;
- error = posix_lock_file_wait(file, fl);
- return error;
-}
-
-static void nolock_recovery_done(void *lockspace, unsigned int jid,
- unsigned int message)
-{
-}
-
-static const struct lm_lockops nolock_ops = {
- .lm_proto_name = "lock_nolock",
- .lm_mount = nolock_mount,
- .lm_others_may_mount = nolock_others_may_mount,
- .lm_unmount = nolock_unmount,
- .lm_withdraw = nolock_withdraw,
- .lm_get_lock = nolock_get_lock,
- .lm_put_lock = nolock_put_lock,
- .lm_lock = nolock_lock,
- .lm_unlock = nolock_unlock,
- .lm_cancel = nolock_cancel,
- .lm_hold_lvb = nolock_hold_lvb,
- .lm_unhold_lvb = nolock_unhold_lvb,
- .lm_plock_get = nolock_plock_get,
- .lm_plock = nolock_plock,
- .lm_punlock = nolock_punlock,
- .lm_recovery_done = nolock_recovery_done,
- .lm_owner = THIS_MODULE,
-};
-
-static int __init init_nolock(void)
-{
- int error;
-
- error = gfs2_register_lockproto(&nolock_ops);
- if (error) {
- printk(KERN_WARNING
- "lock_nolock: can't register protocol: %d\n", error);
- return error;
- }
-
- printk(KERN_INFO
- "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
- return 0;
-}
-
-static void __exit exit_nolock(void)
-{
- gfs2_unregister_lockproto(&nolock_ops);
-}
-
-module_init(init_nolock);
-module_exit(exit_nolock);
-
-MODULE_DESCRIPTION("GFS Nolock Locking Module");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
-
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 548264b1836..ad305854bdc 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/bio.h>
#include "gfs2.h"
#include "incore.h"
@@ -87,6 +88,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
*/
static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+__releases(&sdp->sd_log_lock)
+__acquires(&sdp->sd_log_lock)
{
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
@@ -582,7 +585,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
memset(bh->b_data, 0, bh->b_size);
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
- unlock_buffer(bh);
gfs2_ail1_empty(sdp, 0);
tail = current_tail(sdp);
@@ -599,8 +601,23 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
lh->lh_hash = cpu_to_be32(hash);
- set_buffer_dirty(bh);
- if (sync_dirty_buffer(bh))
+ bh->b_end_io = end_buffer_write_sync;
+ if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
+ goto skip_barrier;
+ get_bh(bh);
+ submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
+ wait_on_buffer(bh);
+ if (buffer_eopnotsupp(bh)) {
+ clear_buffer_eopnotsupp(bh);
+ set_buffer_uptodate(bh);
+ set_bit(SDF_NOBARRIERS, &sdp->sd_flags);
+ lock_buffer(bh);
+skip_barrier:
+ get_bh(bh);
+ submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh);
+ wait_on_buffer(bh);
+ }
+ if (!buffer_uptodate(bh))
gfs2_io_error_bh(sdp, bh);
brelse(bh);
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 77115281650..7c64510ccfd 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -21,6 +21,7 @@
*/
static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
+__acquires(&sdp->sd_log_lock)
{
spin_lock(&sdp->sd_log_lock);
}
@@ -32,6 +33,7 @@ static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
*/
static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
+__releases(&sdp->sd_log_lock)
{
spin_unlock(&sdp->sd_log_lock);
}
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 053e2ebbbd5..bb2cc303ac2 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,7 +24,7 @@
#include "util.h"
#include "glock.h"
-static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
+static void gfs2_init_inode_once(void *foo)
{
struct gfs2_inode *ip = foo;
@@ -33,15 +33,13 @@ static void gfs2_init_inode_once(struct kmem_cache *cachep, void *foo)
ip->i_alloc = NULL;
}
-static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
+static void gfs2_init_glock_once(void *foo)
{
struct gfs2_glock *gl = foo;
INIT_HLIST_NODE(&gl->gl_list);
spin_lock_init(&gl->gl_spin);
INIT_LIST_HEAD(&gl->gl_holders);
- INIT_LIST_HEAD(&gl->gl_waiters1);
- INIT_LIST_HEAD(&gl->gl_waiters3);
gl->gl_lvb = NULL;
atomic_set(&gl->gl_lvb_count, 0);
INIT_LIST_HEAD(&gl->gl_reclaim);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 78d75f892f8..09853620c95 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -129,7 +129,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
}
/**
- * getbuf - Get a buffer with a given address space
+ * gfs2_getbuf - Get a buffer with a given address space
* @gl: the glock
* @blkno: the block number (filesystem scope)
* @create: 1 if the buffer should be created
@@ -137,7 +137,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
* Returns: the buffer
*/
-static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create)
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
{
struct address_space *mapping = gl->gl_aspace->i_mapping;
struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -205,7 +205,7 @@ static void meta_prep_new(struct buffer_head *bh)
struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
{
struct buffer_head *bh;
- bh = getbuf(gl, blkno, CREATE);
+ bh = gfs2_getbuf(gl, blkno, CREATE);
meta_prep_new(bh);
return bh;
}
@@ -223,7 +223,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
struct buffer_head **bhp)
{
- *bhp = getbuf(gl, blkno, CREATE);
+ *bhp = gfs2_getbuf(gl, blkno, CREATE);
if (!buffer_uptodate(*bhp)) {
ll_rw_block(READ_META, 1, bhp);
if (flags & DIO_WAIT) {
@@ -346,7 +346,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
struct buffer_head *bh;
while (blen) {
- bh = getbuf(ip->i_gl, bstart, NO_CREATE);
+ bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE);
if (bh) {
lock_buffer(bh);
gfs2_log_lock(sdp);
@@ -421,7 +421,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
if (extlen > max_ra)
extlen = max_ra;
- first_bh = getbuf(gl, dblock, CREATE);
+ first_bh = gfs2_getbuf(gl, dblock, CREATE);
if (buffer_uptodate(first_bh))
goto out;
@@ -432,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
extlen--;
while (extlen) {
- bh = getbuf(gl, dblock, CREATE);
+ bh = gfs2_getbuf(gl, dblock, CREATE);
if (!buffer_uptodate(bh) && !buffer_locked(bh))
ll_rw_block(READA, 1, &bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 73e3b1c76fe..b1a5f3674d4 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -47,6 +47,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
int flags, struct buffer_head **bhp);
int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create);
void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
int meta);
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index b941f9f9f95..f96eb90a2cf 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -42,10 +42,11 @@ enum {
Opt_nosuiddir,
Opt_data_writeback,
Opt_data_ordered,
+ Opt_meta,
Opt_err,
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_lockproto, "lockproto=%s"},
{Opt_locktable, "locktable=%s"},
{Opt_hostdata, "hostdata=%s"},
@@ -66,6 +67,7 @@ static match_table_t tokens = {
{Opt_nosuiddir, "nosuiddir"},
{Opt_data_writeback, "data=writeback"},
{Opt_data_ordered, "data=ordered"},
+ {Opt_meta, "meta"},
{Opt_err, NULL}
};
@@ -239,6 +241,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
case Opt_data_ordered:
args->ar_data = GFS2_DATA_ORDERED;
break;
+ case Opt_meta:
+ if (remount && args->ar_meta != 1)
+ goto cant_remount;
+ args->ar_meta = 1;
+ break;
case Opt_err:
default:
fs_info(sdp, "unknown option: %s\n", o);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index f55394e57cb..27563816e1c 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -499,34 +499,34 @@ static int __gfs2_readpage(void *file, struct page *page)
* @file: The file to read
* @page: The page of the file
*
- * This deals with the locking required. We use a trylock in order to
- * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE
- * in the event that we are unable to get the lock.
+ * This deals with the locking required. We have to unlock and
+ * relock the page in order to get the locking in the right
+ * order.
*/
static int gfs2_readpage(struct file *file, struct page *page)
{
- struct gfs2_inode *ip = GFS2_I(page->mapping->host);
- struct gfs2_holder *gh;
+ struct address_space *mapping = page->mapping;
+ struct gfs2_inode *ip = GFS2_I(mapping->host);
+ struct gfs2_holder gh;
int error;
- gh = gfs2_glock_is_locked_by_me(ip->i_gl);
- if (!gh) {
- gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS);
- if (!gh)
- return -ENOBUFS;
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh);
+ unlock_page(page);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+ error = gfs2_glock_nq(&gh);
+ if (unlikely(error))
+ goto out;
+ error = AOP_TRUNCATED_PAGE;
+ lock_page(page);
+ if (page->mapping == mapping && !PageUptodate(page))
+ error = __gfs2_readpage(file, page);
+ else
unlock_page(page);
- error = gfs2_glock_nq_atime(gh);
- if (likely(error != 0))
- goto out;
- return AOP_TRUNCATED_PAGE;
- }
- error = __gfs2_readpage(file, page);
- gfs2_glock_dq(gh);
+ gfs2_glock_dq(&gh);
out:
- gfs2_holder_uninit(gh);
- kfree(gh);
+ gfs2_holder_uninit(&gh);
+ if (error && error != AOP_TRUNCATED_PAGE)
+ lock_page(page);
return error;
}
@@ -594,8 +594,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping,
struct gfs2_holder gh;
int ret;
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
- ret = gfs2_glock_nq_atime(&gh);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+ ret = gfs2_glock_nq(&gh);
if (unlikely(ret))
goto out_uninit;
if (!gfs2_is_stuffed(ip))
@@ -636,8 +636,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
unsigned to = from + len;
struct page *page;
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh);
- error = gfs2_glock_nq_atime(&ip->i_gh);
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+ error = gfs2_glock_nq(&ip->i_gh);
if (unlikely(error))
goto out_uninit;
@@ -975,7 +975,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
if (gfs2_is_stuffed(ip))
return 0;
- if (offset > i_size_read(&ip->i_inode))
+ if (offset >= i_size_read(&ip->i_inode))
return 0;
return 1;
}
@@ -1000,8 +1000,8 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
* unfortunately have the option of only flushing a range like
* the VFS does.
*/
- gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh);
- rv = gfs2_glock_nq_atime(&gh);
+ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh);
+ rv = gfs2_glock_nq(&gh);
if (rv)
return rv;
rv = gfs2_ok_for_dio(ip, rw, offset);
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 990d9f4bc46..9cda8536530 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -134,7 +134,7 @@ static struct dentry *gfs2_get_parent(struct dentry *child)
struct dentry *dentry;
gfs2_str2qstr(&dotdot, "..");
- inode = gfs2_lookupi(child->d_inode, &dotdot, 1, NULL);
+ inode = gfs2_lookupi(child->d_inode, &dotdot, 1);
if (!inode)
return ERR_PTR(-ENOENT);
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e1b7d525a06..3a747f8e218 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -15,6 +15,7 @@
#include <linux/uio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
+#include <linux/mount.h>
#include <linux/fs.h>
#include <linux/gfs2_ondisk.h>
#include <linux/ext2_fs.h>
@@ -62,11 +63,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
&i_gh);
if (!error) {
- error = remote_llseek(file, offset, origin);
+ error = generic_file_llseek_unlocked(file, offset, origin);
gfs2_glock_dq_uninit(&i_gh);
}
} else
- error = remote_llseek(file, offset, origin);
+ error = generic_file_llseek_unlocked(file, offset, origin);
return error;
}
@@ -88,8 +89,8 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir)
u64 offset = file->f_pos;
int error;
- gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh);
- error = gfs2_glock_nq_atime(&d_gh);
+ gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
+ error = gfs2_glock_nq(&d_gh);
if (error) {
gfs2_holder_uninit(&d_gh);
return error;
@@ -133,7 +134,6 @@ static const u32 fsflags_to_gfs2[32] = {
[7] = GFS2_DIF_NOATIME,
[12] = GFS2_DIF_EXHASH,
[14] = GFS2_DIF_INHERIT_JDATA,
- [20] = GFS2_DIF_INHERIT_DIRECTIO,
};
static const u32 gfs2_to_fsflags[32] = {
@@ -142,7 +142,6 @@ static const u32 gfs2_to_fsflags[32] = {
[gfs2fl_AppendOnly] = FS_APPEND_FL,
[gfs2fl_NoAtime] = FS_NOATIME_FL,
[gfs2fl_ExHash] = FS_INDEX_FL,
- [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
[gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
};
@@ -154,18 +153,14 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
int error;
u32 fsflags;
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
- error = gfs2_glock_nq_atime(&gh);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
+ error = gfs2_glock_nq(&gh);
if (error)
return error;
fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
- if (!S_ISDIR(inode->i_mode)) {
- if (ip->i_di.di_flags & GFS2_DIF_JDATA)
- fsflags |= FS_JOURNAL_DATA_FL;
- if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
- fsflags |= FS_DIRECTIO_FL;
- }
+ if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA)
+ fsflags |= FS_JOURNAL_DATA_FL;
if (put_user(fsflags, ptr))
error = -EFAULT;
@@ -194,13 +189,11 @@ void gfs2_set_inode_flags(struct inode *inode)
/* Flags that can be set by user space */
#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
- GFS2_DIF_DIRECTIO| \
GFS2_DIF_IMMUTABLE| \
GFS2_DIF_APPENDONLY| \
GFS2_DIF_NOATIME| \
GFS2_DIF_SYNC| \
GFS2_DIF_SYSTEM| \
- GFS2_DIF_INHERIT_DIRECTIO| \
GFS2_DIF_INHERIT_JDATA)
/**
@@ -220,10 +213,14 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
int error;
u32 new_flags, flags;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ error = mnt_want_write(filp->f_path.mnt);
if (error)
return error;
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ if (error)
+ goto out_drop_write;
+
flags = ip->i_di.di_flags;
new_flags = (flags & ~mask) | (reqflags & mask);
if ((new_flags ^ flags) == 0)
@@ -242,7 +239,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
!capable(CAP_LINUX_IMMUTABLE))
goto out;
if (!IS_IMMUTABLE(inode)) {
- error = permission(inode, MAY_WRITE, NULL);
+ error = gfs2_permission(inode, MAY_WRITE);
if (error)
goto out;
}
@@ -272,6 +269,8 @@ out_trans_end:
gfs2_trans_end(sdp);
out:
gfs2_glock_dq_uninit(&gh);
+out_drop_write:
+ mnt_drop_write(filp->f_path.mnt);
return error;
}
@@ -285,8 +284,6 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
if (!S_ISDIR(inode->i_mode)) {
if (gfsflags & GFS2_DIF_INHERIT_JDATA)
gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
- if (gfsflags & GFS2_DIF_INHERIT_DIRECTIO)
- gfsflags ^= (GFS2_DIF_DIRECTIO | GFS2_DIF_INHERIT_DIRECTIO);
return do_gfs2_set_flags(filp, gfsflags, ~0);
}
return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
@@ -354,8 +351,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
struct gfs2_alloc *al;
int ret;
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh);
- ret = gfs2_glock_nq_atime(&gh);
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ ret = gfs2_glock_nq(&gh);
if (ret)
goto out;
@@ -437,8 +434,8 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
struct gfs2_holder i_gh;
int error;
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh);
- error = gfs2_glock_nq_atime(&i_gh);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+ error = gfs2_glock_nq(&i_gh);
if (error) {
gfs2_holder_uninit(&i_gh);
return error;
@@ -487,11 +484,6 @@ static int gfs2_open(struct inode *inode, struct file *file)
goto fail_gunlock;
}
- /* Listen to the Direct I/O flag */
-
- if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
- file->f_flags |= O_DIRECT;
-
gfs2_glock_dq_uninit(&i_gh);
}
@@ -669,8 +661,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
int error = 0;
state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
- flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE
- | GL_FLOCK;
+ flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
mutex_lock(&fp->f_fl_mutex);
@@ -683,9 +674,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
gfs2_glock_dq_wait(fl_gh);
gfs2_holder_reinit(state, flags, fl_gh);
} else {
- error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
- ip->i_no_addr, &gfs2_flock_glops,
- CREATE, &gl);
+ error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
+ &gfs2_flock_glops, CREATE, &gl);
if (error)
goto out;
gfs2_holder_init(gl, state, flags, fl_gh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b2028c82e8d..b117fcf2c4f 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -40,6 +40,44 @@
#define DO 0
#define UNDO 1
+static const u32 gfs2_old_fs_formats[] = {
+ 0
+};
+
+static const u32 gfs2_old_multihost_formats[] = {
+ 0
+};
+
+/**
+ * gfs2_tune_init - Fill a gfs2_tune structure with default values
+ * @gt: tune
+ *
+ */
+
+static void gfs2_tune_init(struct gfs2_tune *gt)
+{
+ spin_lock_init(&gt->gt_spin);
+
+ gt->gt_demote_secs = 300;
+ gt->gt_incore_log_blocks = 1024;
+ gt->gt_log_flush_secs = 60;
+ gt->gt_recoverd_secs = 60;
+ gt->gt_logd_secs = 1;
+ gt->gt_quotad_secs = 5;
+ gt->gt_quota_simul_sync = 64;
+ gt->gt_quota_warn_period = 10;
+ gt->gt_quota_scale_num = 1;
+ gt->gt_quota_scale_den = 1;
+ gt->gt_quota_cache_secs = 300;
+ gt->gt_quota_quantum = 60;
+ gt->gt_new_files_jdata = 0;
+ gt->gt_max_readahead = 1 << 18;
+ gt->gt_stall_secs = 600;
+ gt->gt_complain_secs = 10;
+ gt->gt_statfs_quantum = 30;
+ gt->gt_statfs_slow = 0;
+}
+
static struct gfs2_sbd *init_sbd(struct super_block *sb)
{
struct gfs2_sbd *sdp;
@@ -64,7 +102,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
mutex_init(&sdp->sd_rindex_mutex);
INIT_LIST_HEAD(&sdp->sd_rindex_list);
INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
- INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
INIT_LIST_HEAD(&sdp->sd_jindex_list);
spin_lock_init(&sdp->sd_jindex_spin);
@@ -97,21 +134,271 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
return sdp;
}
-static void init_vfs(struct super_block *sb, unsigned noatime)
+
+/**
+ * gfs2_check_sb - Check superblock
+ * @sdp: the filesystem
+ * @sb: The superblock
+ * @silent: Don't print a message if the check fails
+ *
+ * Checks the version code of the FS is one that we understand how to
+ * read and that the sizes of the various on-disk structures have not
+ * changed.
+ */
+
+static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
{
- struct gfs2_sbd *sdp = sb->s_fs_info;
+ unsigned int x;
- sb->s_magic = GFS2_MAGIC;
- sb->s_op = &gfs2_super_ops;
- sb->s_export_op = &gfs2_export_ops;
- sb->s_time_gran = 1;
- sb->s_maxbytes = MAX_LFS_FILESIZE;
+ if (sb->sb_magic != GFS2_MAGIC ||
+ sb->sb_type != GFS2_METATYPE_SB) {
+ if (!silent)
+ printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
+ return -EINVAL;
+ }
+
+ /* If format numbers match exactly, we're done. */
+
+ if (sb->sb_fs_format == GFS2_FORMAT_FS &&
+ sb->sb_multihost_format == GFS2_FORMAT_MULTI)
+ return 0;
+
+ if (sb->sb_fs_format != GFS2_FORMAT_FS) {
+ for (x = 0; gfs2_old_fs_formats[x]; x++)
+ if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
+ break;
+
+ if (!gfs2_old_fs_formats[x]) {
+ printk(KERN_WARNING
+ "GFS2: code version (%u, %u) is incompatible "
+ "with ondisk format (%u, %u)\n",
+ GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+ sb->sb_fs_format, sb->sb_multihost_format);
+ printk(KERN_WARNING
+ "GFS2: I don't know how to upgrade this FS\n");
+ return -EINVAL;
+ }
+ }
+
+ if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
+ for (x = 0; gfs2_old_multihost_formats[x]; x++)
+ if (gfs2_old_multihost_formats[x] ==
+ sb->sb_multihost_format)
+ break;
+
+ if (!gfs2_old_multihost_formats[x]) {
+ printk(KERN_WARNING
+ "GFS2: code version (%u, %u) is incompatible "
+ "with ondisk format (%u, %u)\n",
+ GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+ sb->sb_fs_format, sb->sb_multihost_format);
+ printk(KERN_WARNING
+ "GFS2: I don't know how to upgrade this FS\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!sdp->sd_args.ar_upgrade) {
+ printk(KERN_WARNING
+ "GFS2: code version (%u, %u) is incompatible "
+ "with ondisk format (%u, %u)\n",
+ GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
+ sb->sb_fs_format, sb->sb_multihost_format);
+ printk(KERN_INFO
+ "GFS2: Use the \"upgrade\" mount option to upgrade "
+ "the FS\n");
+ printk(KERN_INFO "GFS2: See the manual for more details\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void end_bio_io_page(struct bio *bio, int error)
+{
+ struct page *page = bio->bi_private;
- if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME))
- set_bit(noatime, &sdp->sd_flags);
+ if (!error)
+ SetPageUptodate(page);
+ else
+ printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
+ unlock_page(page);
+}
+
+static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
+{
+ const struct gfs2_sb *str = buf;
+
+ sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
+ sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
+ sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
+ sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
+ sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
+ sb->sb_bsize = be32_to_cpu(str->sb_bsize);
+ sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
+ sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
+ sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
+ sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
+ sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
+
+ memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
+ memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
+}
+
+/**
+ * gfs2_read_super - Read the gfs2 super block from disk
+ * @sdp: The GFS2 super block
+ * @sector: The location of the super block
+ * @error: The error code to return
+ *
+ * This uses the bio functions to read the super block from disk
+ * because we want to be 100% sure that we never read cached data.
+ * A super block is read twice only during each GFS2 mount and is
+ * never written to by the filesystem. The first time its read no
+ * locks are held, and the only details which are looked at are those
+ * relating to the locking protocol. Once locking is up and working,
+ * the sb is read again under the lock to establish the location of
+ * the master directory (contains pointers to journals etc) and the
+ * root directory.
+ *
+ * Returns: 0 on success or error
+ */
+
+static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
+{
+ struct super_block *sb = sdp->sd_vfs;
+ struct gfs2_sb *p;
+ struct page *page;
+ struct bio *bio;
+
+ page = alloc_page(GFP_NOFS);
+ if (unlikely(!page))
+ return -ENOBUFS;
+
+ ClearPageUptodate(page);
+ ClearPageDirty(page);
+ lock_page(page);
+
+ bio = bio_alloc(GFP_NOFS, 1);
+ if (unlikely(!bio)) {
+ __free_page(page);
+ return -ENOBUFS;
+ }
- /* Don't let the VFS update atimes. GFS2 handles this itself. */
- sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
+ bio->bi_sector = sector * (sb->s_blocksize >> 9);
+ bio->bi_bdev = sb->s_bdev;
+ bio_add_page(bio, page, PAGE_SIZE, 0);
+
+ bio->bi_end_io = end_bio_io_page;
+ bio->bi_private = page;
+ submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
+ wait_on_page_locked(page);
+ bio_put(bio);
+ if (!PageUptodate(page)) {
+ __free_page(page);
+ return -EIO;
+ }
+ p = kmap(page);
+ gfs2_sb_in(&sdp->sd_sb, p);
+ kunmap(page);
+ __free_page(page);
+ return 0;
+}
+/**
+ * gfs2_read_sb - Read super block
+ * @sdp: The GFS2 superblock
+ * @gl: the glock for the superblock (assumed to be held)
+ * @silent: Don't print message if mount fails
+ *
+ */
+
+static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
+{
+ u32 hash_blocks, ind_blocks, leaf_blocks;
+ u32 tmp_blocks;
+ unsigned int x;
+ int error;
+
+ error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
+ if (error) {
+ if (!silent)
+ fs_err(sdp, "can't read superblock\n");
+ return error;
+ }
+
+ error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
+ if (error)
+ return error;
+
+ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
+ GFS2_BASIC_BLOCK_SHIFT;
+ sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
+ sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_dinode)) / sizeof(u64);
+ sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_meta_header)) / sizeof(u64);
+ sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
+ sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
+ sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
+ sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
+ sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_meta_header)) /
+ sizeof(struct gfs2_quota_change);
+
+ /* Compute maximum reservation required to add a entry to a directory */
+
+ hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
+ sdp->sd_jbsize);
+
+ ind_blocks = 0;
+ for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
+ tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
+ ind_blocks += tmp_blocks;
+ }
+
+ leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
+
+ sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
+
+ sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_dinode);
+ sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
+ for (x = 2;; x++) {
+ u64 space, d;
+ u32 m;
+
+ space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
+ d = space;
+ m = do_div(d, sdp->sd_inptrs);
+
+ if (d != sdp->sd_heightsize[x - 1] || m)
+ break;
+ sdp->sd_heightsize[x] = space;
+ }
+ sdp->sd_max_height = x;
+ sdp->sd_heightsize[x] = ~0;
+ gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
+
+ sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
+ sizeof(struct gfs2_dinode);
+ sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
+ for (x = 2;; x++) {
+ u64 space, d;
+ u32 m;
+
+ space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
+ d = space;
+ m = do_div(d, sdp->sd_inptrs);
+
+ if (d != sdp->sd_jheightsize[x - 1] || m)
+ break;
+ sdp->sd_jheightsize[x] = space;
+ }
+ sdp->sd_max_jheight = x;
+ sdp->sd_jheightsize[x] = ~0;
+ gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
+
+ return 0;
}
static int init_names(struct gfs2_sbd *sdp, int silent)
@@ -225,51 +512,59 @@ fail:
return error;
}
-static inline struct inode *gfs2_lookup_root(struct super_block *sb,
- u64 no_addr)
+static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
+ u64 no_addr, const char *name)
{
- return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ struct dentry *dentry;
+ struct inode *inode;
+
+ inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0);
+ if (IS_ERR(inode)) {
+ fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode));
+ return PTR_ERR(inode);
+ }
+ dentry = d_alloc_root(inode);
+ if (!dentry) {
+ fs_err(sdp, "can't alloc %s dentry\n", name);
+ iput(inode);
+ return -ENOMEM;
+ }
+ dentry->d_op = &gfs2_dops;
+ *dptr = dentry;
+ return 0;
}
-static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
+static int init_sb(struct gfs2_sbd *sdp, int silent)
{
struct super_block *sb = sdp->sd_vfs;
struct gfs2_holder sb_gh;
u64 no_addr;
- struct inode *inode;
- int error = 0;
+ int ret;
- if (undo) {
- if (sb->s_root) {
- dput(sb->s_root);
- sb->s_root = NULL;
- }
- return 0;
+ ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
+ LM_ST_SHARED, 0, &sb_gh);
+ if (ret) {
+ fs_err(sdp, "can't acquire superblock glock: %d\n", ret);
+ return ret;
}
- error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops,
- LM_ST_SHARED, 0, &sb_gh);
- if (error) {
- fs_err(sdp, "can't acquire superblock glock: %d\n", error);
- return error;
- }
-
- error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
- if (error) {
- fs_err(sdp, "can't read superblock: %d\n", error);
+ ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
+ if (ret) {
+ fs_err(sdp, "can't read superblock: %d\n", ret);
goto out;
}
/* Set up the buffer cache and SB for real */
if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
- error = -EINVAL;
+ ret = -EINVAL;
fs_err(sdp, "FS block size (%u) is too small for device "
"block size (%u)\n",
sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
goto out;
}
if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
- error = -EINVAL;
+ ret = -EINVAL;
fs_err(sdp, "FS block size (%u) is too big for machine "
"page size (%u)\n",
sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
@@ -279,26 +574,21 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo)
/* Get the root inode */
no_addr = sdp->sd_sb.sb_root_dir.no_addr;
- if (sb->s_type == &gfs2meta_fs_type)
- no_addr = sdp->sd_sb.sb_master_dir.no_addr;
- inode = gfs2_lookup_root(sb, no_addr);
- if (IS_ERR(inode)) {
- error = PTR_ERR(inode);
- fs_err(sdp, "can't read in root inode: %d\n", error);
+ ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, "root");
+ if (ret)
goto out;
- }
- sb->s_root = d_alloc_root(inode);
- if (!sb->s_root) {
- fs_err(sdp, "can't get root dentry\n");
- error = -ENOMEM;
- iput(inode);
- } else
- sb->s_root->d_op = &gfs2_dops;
-
+ /* Get the master inode */
+ no_addr = sdp->sd_sb.sb_master_dir.no_addr;
+ ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master");
+ if (ret) {
+ dput(sdp->sd_root_dir);
+ goto out;
+ }
+ sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir);
out:
gfs2_glock_dq_uninit(&sb_gh);
- return error;
+ return ret;
}
/**
@@ -364,6 +654,8 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
{
+ if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount)
+ return;
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
sdp->sd_lockstruct.ls_lockspace);
@@ -371,6 +663,7 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
static int init_journal(struct gfs2_sbd *sdp, int undo)
{
+ struct inode *master = sdp->sd_master_dir->d_inode;
struct gfs2_holder ji_gh;
struct task_struct *p;
struct gfs2_inode *ip;
@@ -382,7 +675,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
goto fail_recoverd;
}
- sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex");
+ sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
if (IS_ERR(sdp->sd_jindex)) {
fs_err(sdp, "can't lookup journal index: %d\n", error);
return PTR_ERR(sdp->sd_jindex);
@@ -505,25 +798,17 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
{
int error = 0;
struct gfs2_inode *ip;
- struct inode *inode;
+ struct inode *master = sdp->sd_master_dir->d_inode;
if (undo)
goto fail_qinode;
- inode = gfs2_lookup_root(sdp->sd_vfs, sdp->sd_sb.sb_master_dir.no_addr);
- if (IS_ERR(inode)) {
- error = PTR_ERR(inode);
- fs_err(sdp, "can't read in master directory: %d\n", error);
- goto fail;
- }
- sdp->sd_master_dir = inode;
-
error = init_journal(sdp, undo);
if (error)
- goto fail_master;
+ goto fail;
/* Read in the master inode number inode */
- sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum");
+ sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum");
if (IS_ERR(sdp->sd_inum_inode)) {
error = PTR_ERR(sdp->sd_inum_inode);
fs_err(sdp, "can't read in inum inode: %d\n", error);
@@ -532,7 +817,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
/* Read in the master statfs inode */
- sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs");
+ sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs");
if (IS_ERR(sdp->sd_statfs_inode)) {
error = PTR_ERR(sdp->sd_statfs_inode);
fs_err(sdp, "can't read in statfs inode: %d\n", error);
@@ -540,7 +825,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
}
/* Read in the resource index inode */
- sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex");
+ sdp->sd_rindex = gfs2_lookup_simple(master, "rindex");
if (IS_ERR(sdp->sd_rindex)) {
error = PTR_ERR(sdp->sd_rindex);
fs_err(sdp, "can't get resource index inode: %d\n", error);
@@ -551,7 +836,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
sdp->sd_rindex_uptodate = 0;
/* Read in the quota inode */
- sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
+ sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota");
if (IS_ERR(sdp->sd_quota_inode)) {
error = PTR_ERR(sdp->sd_quota_inode);
fs_err(sdp, "can't get quota file inode: %d\n", error);
@@ -570,8 +855,6 @@ fail_inum:
iput(sdp->sd_inum_inode);
fail_journal:
init_journal(sdp, UNDO);
-fail_master:
- iput(sdp->sd_master_dir);
fail:
return error;
}
@@ -582,6 +865,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
char buf[30];
int error = 0;
struct gfs2_inode *ip;
+ struct inode *master = sdp->sd_master_dir->d_inode;
if (sdp->sd_args.ar_spectator)
return 0;
@@ -589,7 +873,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo)
if (undo)
goto fail_qc_gh;
- pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node");
+ pn = gfs2_lookup_simple(master, "per_node");
if (IS_ERR(pn)) {
error = PTR_ERR(pn);
fs_err(sdp, "can't find per_node directory: %d\n", error);
@@ -741,8 +1025,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
goto out;
}
- if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
- gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
+ if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
GFS2_MIN_LVB_SIZE)) {
gfs2_unmount_lockproto(&sdp->sd_lockstruct);
@@ -800,7 +1083,11 @@ static int fill_super(struct super_block *sb, void *data, int silent)
goto fail;
}
- init_vfs(sb, SDF_NOATIME);
+ sb->s_magic = GFS2_MAGIC;
+ sb->s_op = &gfs2_super_ops;
+ sb->s_export_op = &gfs2_export_ops;
+ sb->s_time_gran = 1;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
/* Set up the buffer cache and fill in some fake block size values
to allow us to read-in the on-disk superblock. */
@@ -828,7 +1115,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
if (error)
goto fail_lm;
- error = init_sb(sdp, silent, DO);
+ error = init_sb(sdp, silent);
if (error)
goto fail_locking;
@@ -869,11 +1156,15 @@ fail_per_node:
fail_inodes:
init_inodes(sdp, UNDO);
fail_sb:
- init_sb(sdp, 0, UNDO);
+ if (sdp->sd_root_dir)
+ dput(sdp->sd_root_dir);
+ if (sdp->sd_master_dir)
+ dput(sdp->sd_master_dir);
+ sb->s_root = NULL;
fail_locking:
init_locking(sdp, &mount_gh, UNDO);
fail_lm:
- gfs2_gl_hash_clear(sdp, WAIT);
+ gfs2_gl_hash_clear(sdp);
gfs2_lm_unmount(sdp);
while (invalidate_inodes(sb))
yield();
@@ -887,151 +1178,63 @@ fail:
}
static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data, struct vfsmount *mnt)
+ const char *dev_name, void *data, struct vfsmount *mnt)
{
- struct super_block *sb;
- struct gfs2_sbd *sdp;
- int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
- if (error)
- goto out;
- sb = mnt->mnt_sb;
- sdp = sb->s_fs_info;
- sdp->sd_gfs2mnt = mnt;
-out:
- return error;
+ return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
}
-static int fill_super_meta(struct super_block *sb, struct super_block *new,
- void *data, int silent)
+static struct super_block *get_gfs2_sb(const char *dev_name)
{
- struct gfs2_sbd *sdp = sb->s_fs_info;
- struct inode *inode;
- int error = 0;
-
- new->s_fs_info = sdp;
- sdp->sd_vfs_meta = sb;
-
- init_vfs(new, SDF_NOATIME);
-
- /* Get the master inode */
- inode = igrab(sdp->sd_master_dir);
-
- new->s_root = d_alloc_root(inode);
- if (!new->s_root) {
- fs_err(sdp, "can't get root dentry\n");
- error = -ENOMEM;
- iput(inode);
- } else
- new->s_root->d_op = &gfs2_dops;
-
- return error;
-}
-
-static int set_bdev_super(struct super_block *s, void *data)
-{
- s->s_bdev = data;
- s->s_dev = s->s_bdev->bd_dev;
- return 0;
-}
-
-static int test_bdev_super(struct super_block *s, void *data)
-{
- return s->s_bdev == data;
-}
-
-static struct super_block* get_gfs2_sb(const char *dev_name)
-{
- struct kstat stat;
+ struct super_block *sb;
struct nameidata nd;
- struct super_block *sb = NULL, *s;
int error;
error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd);
if (error) {
- printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n",
- dev_name);
- goto out;
- }
- error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat);
-
- list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) {
- if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
- (S_ISDIR(stat.mode) &&
- s == nd.path.dentry->d_inode->i_sb)) {
- sb = s;
- goto free_nd;
- }
+ printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
+ dev_name, error);
+ return NULL;
}
-
- printk(KERN_WARNING "GFS2: Unrecognized block device or "
- "mount point %s\n", dev_name);
-
-free_nd:
+ sb = nd.path.dentry->d_inode->i_sb;
+ if (sb && (sb->s_type == &gfs2_fs_type))
+ atomic_inc(&sb->s_active);
+ else
+ sb = NULL;
path_put(&nd.path);
-out:
return sb;
}
static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data, struct vfsmount *mnt)
{
- int error = 0;
- struct super_block *sb = NULL, *new;
+ struct super_block *sb = NULL;
struct gfs2_sbd *sdp;
sb = get_gfs2_sb(dev_name);
if (!sb) {
printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
- error = -ENOENT;
- goto error;
+ return -ENOENT;
}
sdp = sb->s_fs_info;
- if (sdp->sd_vfs_meta) {
- printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n");
- error = -EBUSY;
- goto error;
- }
- down(&sb->s_bdev->bd_mount_sem);
- new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev);
- up(&sb->s_bdev->bd_mount_sem);
- if (IS_ERR(new)) {
- error = PTR_ERR(new);
- goto error;
- }
- new->s_flags = flags;
- strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
- sb_set_blocksize(new, sb->s_blocksize);
- error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0);
- if (error) {
- up_write(&new->s_umount);
- deactivate_super(new);
- goto error;
- }
-
- new->s_flags |= MS_ACTIVE;
-
- /* Grab a reference to the gfs2 mount point */
- atomic_inc(&sdp->sd_gfs2mnt->mnt_count);
- return simple_set_mnt(mnt, new);
-error:
- return error;
+ mnt->mnt_sb = sb;
+ mnt->mnt_root = dget(sdp->sd_master_dir);
+ return 0;
}
static void gfs2_kill_sb(struct super_block *sb)
{
- if (sb->s_fs_info) {
- gfs2_delete_debugfs_file(sb->s_fs_info);
- gfs2_meta_syncfs(sb->s_fs_info);
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ if (sdp) {
+ gfs2_meta_syncfs(sdp);
+ dput(sdp->sd_root_dir);
+ dput(sdp->sd_master_dir);
+ sdp->sd_root_dir = NULL;
+ sdp->sd_master_dir = NULL;
}
+ shrink_dcache_sb(sb);
kill_block_super(sb);
-}
-
-static void gfs2_kill_sb_meta(struct super_block *sb)
-{
- struct gfs2_sbd *sdp = sb->s_fs_info;
- generic_shutdown_super(sb);
- sdp->sd_vfs_meta = NULL;
- atomic_dec(&sdp->sd_gfs2mnt->mnt_count);
+ if (sdp)
+ gfs2_delete_debugfs_file(sdp);
}
struct file_system_type gfs2_fs_type = {
@@ -1046,7 +1249,6 @@ struct file_system_type gfs2meta_fs_type = {
.name = "gfs2meta",
.fs_flags = FS_REQUIRES_DEV,
.get_sb = gfs2_get_sb_meta,
- .kill_sb = gfs2_kill_sb_meta,
.owner = THIS_MODULE,
};
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 2686ad4c002..534e1e2c65c 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -74,7 +74,7 @@ static int gfs2_create(struct inode *dir, struct dentry *dentry,
return PTR_ERR(inode);
}
- inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
+ inode = gfs2_lookupi(dir, &dentry->d_name, 0);
if (inode) {
if (!IS_ERR(inode)) {
gfs2_holder_uninit(ghs);
@@ -109,7 +109,7 @@ static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
dentry->d_op = &gfs2_dops;
- inode = gfs2_lookupi(dir, &dentry->d_name, 0, nd);
+ inode = gfs2_lookupi(dir, &dentry->d_name, 0);
if (inode && IS_ERR(inode))
return ERR_CAST(inode);
@@ -159,11 +159,15 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1);
- error = gfs2_glock_nq_m(2, ghs);
+ error = gfs2_glock_nq(ghs); /* parent */
if (error)
- goto out;
+ goto out_parent;
+
+ error = gfs2_glock_nq(ghs + 1); /* child */
+ if (error)
+ goto out_child;
- error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
if (error)
goto out_gunlock;
@@ -245,8 +249,10 @@ out_alloc:
if (alloc_required)
gfs2_alloc_put(dip);
out_gunlock:
- gfs2_glock_dq_m(2, ghs);
-out:
+ gfs2_glock_dq(ghs + 1);
+out_child:
+ gfs2_glock_dq(ghs);
+out_parent:
gfs2_holder_uninit(ghs);
gfs2_holder_uninit(ghs + 1);
if (!error) {
@@ -302,7 +308,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
if (error)
- goto out_rgrp;
+ goto out_gunlock;
error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0);
if (error)
@@ -316,6 +322,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
out_end_trans:
gfs2_trans_end(sdp);
+out_gunlock:
gfs2_glock_dq(ghs + 2);
out_rgrp:
gfs2_holder_uninit(ghs + 2);
@@ -485,7 +492,6 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
struct gfs2_holder ri_gh;
int error;
-
error = gfs2_rindex_hold(sdp, &ri_gh);
if (error)
return error;
@@ -495,9 +501,17 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr);
gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2);
- error = gfs2_glock_nq_m(3, ghs);
+ error = gfs2_glock_nq(ghs); /* parent */
if (error)
- goto out;
+ goto out_parent;
+
+ error = gfs2_glock_nq(ghs + 1); /* child */
+ if (error)
+ goto out_child;
+
+ error = gfs2_glock_nq(ghs + 2); /* rgrp */
+ if (error)
+ goto out_rgrp;
error = gfs2_unlink_ok(dip, &dentry->d_name, ip);
if (error)
@@ -523,11 +537,15 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry)
gfs2_trans_end(sdp);
out_gunlock:
- gfs2_glock_dq_m(3, ghs);
-out:
- gfs2_holder_uninit(ghs);
- gfs2_holder_uninit(ghs + 1);
+ gfs2_glock_dq(ghs + 2);
+out_rgrp:
gfs2_holder_uninit(ghs + 2);
+ gfs2_glock_dq(ghs + 1);
+out_child:
+ gfs2_holder_uninit(ghs + 1);
+ gfs2_glock_dq(ghs);
+out_parent:
+ gfs2_holder_uninit(ghs);
gfs2_glock_dq_uninit(&ri_gh);
return error;
}
@@ -571,6 +589,54 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode,
return 0;
}
+/*
+ * gfs2_ok_to_move - check if it's ok to move a directory to another directory
+ * @this: move this
+ * @to: to here
+ *
+ * Follow @to back to the root and make sure we don't encounter @this
+ * Assumes we already hold the rename lock.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
+{
+ struct inode *dir = &to->i_inode;
+ struct super_block *sb = dir->i_sb;
+ struct inode *tmp;
+ struct qstr dotdot;
+ int error = 0;
+
+ gfs2_str2qstr(&dotdot, "..");
+
+ igrab(dir);
+
+ for (;;) {
+ if (dir == &this->i_inode) {
+ error = -EINVAL;
+ break;
+ }
+ if (dir == sb->s_root->d_inode) {
+ error = 0;
+ break;
+ }
+
+ tmp = gfs2_lookupi(dir, &dotdot, 1);
+ if (IS_ERR(tmp)) {
+ error = PTR_ERR(tmp);
+ break;
+ }
+
+ iput(dir);
+ dir = tmp;
+ }
+
+ iput(dir);
+
+ return error;
+}
+
/**
* gfs2_rename - Rename a file
* @odir: Parent directory of old file name
@@ -589,7 +655,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
struct gfs2_inode *nip = NULL;
struct gfs2_sbd *sdp = GFS2_SB(odir);
- struct gfs2_holder ghs[5], r_gh;
+ struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, };
struct gfs2_rgrpd *nrgd;
unsigned int num_gh;
int dir_rename = 0;
@@ -603,19 +669,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
return 0;
}
- /* Make sure we aren't trying to move a dirctory into it's subdir */
- if (S_ISDIR(ip->i_inode.i_mode) && odip != ndip) {
- dir_rename = 1;
-
- error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0,
- &r_gh);
+ if (odip != ndip) {
+ error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
+ 0, &r_gh);
if (error)
goto out;
- error = gfs2_ok_to_move(ip, ndip);
- if (error)
- goto out_gunlock_r;
+ if (S_ISDIR(ip->i_inode.i_mode)) {
+ dir_rename = 1;
+ /* don't move a dirctory into it's subdir */
+ error = gfs2_ok_to_move(ip, ndip);
+ if (error)
+ goto out_gunlock_r;
+ }
}
num_gh = 1;
@@ -639,9 +706,11 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++);
}
- error = gfs2_glock_nq_m(num_gh, ghs);
- if (error)
- goto out_uninit;
+ for (x = 0; x < num_gh; x++) {
+ error = gfs2_glock_nq(ghs + x);
+ if (error)
+ goto out_gunlock;
+ }
/* Check out the old directory */
@@ -669,7 +738,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
}
}
} else {
- error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
if (error)
goto out_gunlock;
@@ -704,7 +773,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
/* Check out the dir to be renamed */
if (dir_rename) {
- error = permission(odentry->d_inode, MAY_WRITE, NULL);
+ error = gfs2_permission(odentry->d_inode, MAY_WRITE);
if (error)
goto out_gunlock;
}
@@ -804,12 +873,12 @@ out_alloc:
if (alloc_required)
gfs2_alloc_put(ndip);
out_gunlock:
- gfs2_glock_dq_m(num_gh, ghs);
-out_uninit:
- for (x = 0; x < num_gh; x++)
+ while (x--) {
+ gfs2_glock_dq(ghs + x);
gfs2_holder_uninit(ghs + x);
+ }
out_gunlock_r:
- if (dir_rename)
+ if (r_gh.gh_gl)
gfs2_glock_dq_uninit(&r_gh);
out:
return error;
@@ -891,7 +960,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
* Returns: errno
*/
-static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+int gfs2_permission(struct inode *inode, int mask)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder i_gh;
@@ -905,7 +974,10 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
unlock = 1;
}
- error = generic_permission(inode, mask, gfs2_check_acl);
+ if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
+ error = -EACCES;
+ else
+ error = generic_permission(inode, mask, gfs2_check_acl);
if (unlock)
gfs2_glock_dq_uninit(&i_gh);
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 0b7cc920eb8..d5355d9b592 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -20,6 +20,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/lm_interface.h>
+#include <linux/time.h>
#include "gfs2.h"
#include "incore.h"
@@ -38,6 +39,7 @@
#include "dir.h"
#include "eattr.h"
#include "bmap.h"
+#include "meta_io.h"
/**
* gfs2_write_inode - Make sure the inode is stable on the disk
@@ -50,16 +52,74 @@
static int gfs2_write_inode(struct inode *inode, int sync)
{
struct gfs2_inode *ip = GFS2_I(inode);
-
- /* Check this is a "normal" inode */
- if (test_bit(GIF_USER, &ip->i_flags)) {
- if (current->flags & PF_MEMALLOC)
- return 0;
- if (sync)
- gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_holder gh;
+ struct buffer_head *bh;
+ struct timespec atime;
+ struct gfs2_dinode *di;
+ int ret = 0;
+
+ /* Check this is a "normal" inode, etc */
+ if (!test_bit(GIF_USER, &ip->i_flags) ||
+ (current->flags & PF_MEMALLOC))
+ return 0;
+ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ if (ret)
+ goto do_flush;
+ ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
+ if (ret)
+ goto do_unlock;
+ ret = gfs2_meta_inode_buffer(ip, &bh);
+ if (ret == 0) {
+ di = (struct gfs2_dinode *)bh->b_data;
+ atime.tv_sec = be64_to_cpu(di->di_atime);
+ atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
+ if (timespec_compare(&inode->i_atime, &atime) > 0) {
+ gfs2_trans_add_bh(ip->i_gl, bh, 1);
+ gfs2_dinode_out(ip, bh->b_data);
+ }
+ brelse(bh);
}
+ gfs2_trans_end(sdp);
+do_unlock:
+ gfs2_glock_dq_uninit(&gh);
+do_flush:
+ if (sync != 0)
+ gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+ return ret;
+}
- return 0;
+/**
+ * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
+ * @sdp: the filesystem
+ *
+ * Returns: errno
+ */
+
+static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+{
+ struct gfs2_holder t_gh;
+ int error;
+
+ gfs2_quota_sync(sdp);
+ gfs2_statfs_sync(sdp);
+
+ error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
+ &t_gh);
+ if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+ return error;
+
+ gfs2_meta_syncfs(sdp);
+ gfs2_log_shutdown(sdp);
+
+ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
+ if (t_gh.gh_gl)
+ gfs2_glock_dq_uninit(&t_gh);
+
+ gfs2_quota_cleanup(sdp);
+
+ return error;
}
/**
@@ -73,12 +133,6 @@ static void gfs2_put_super(struct super_block *sb)
struct gfs2_sbd *sdp = sb->s_fs_info;
int error;
- if (!sdp)
- return;
-
- if (!strncmp(sb->s_type->name, "gfs2meta", 8))
- return; /* Nothing to do */
-
/* Unfreeze the filesystem, if we need to */
mutex_lock(&sdp->sd_freeze_lock);
@@ -101,7 +155,6 @@ static void gfs2_put_super(struct super_block *sb)
/* Release stuff */
- iput(sdp->sd_master_dir);
iput(sdp->sd_jindex);
iput(sdp->sd_inum_inode);
iput(sdp->sd_statfs_inode);
@@ -126,7 +179,7 @@ static void gfs2_put_super(struct super_block *sb)
gfs2_clear_rgrpd(sdp);
gfs2_jindex_free(sdp);
/* Take apart glock structures and buffer lists */
- gfs2_gl_hash_clear(sdp, WAIT);
+ gfs2_gl_hash_clear(sdp);
/* Unmount the locking protocol */
gfs2_lm_unmount(sdp);
@@ -152,10 +205,11 @@ static void gfs2_write_super(struct super_block *sb)
*
* Flushes the log to disk.
*/
+
static int gfs2_sync_fs(struct super_block *sb, int wait)
{
sb->s_dirt = 0;
- if (wait)
+ if (wait && sb->s_fs_info)
gfs2_log_flush(sb->s_fs_info, NULL);
return 0;
}
@@ -270,14 +324,6 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
}
}
- if (*flags & (MS_NOATIME | MS_NODIRATIME))
- set_bit(SDF_NOATIME, &sdp->sd_flags);
- else
- clear_bit(SDF_NOATIME, &sdp->sd_flags);
-
- /* Don't let the VFS update atimes. GFS2 handles this itself. */
- *flags |= MS_NOATIME | MS_NODIRATIME;
-
return error;
}
@@ -295,6 +341,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
* inode's blocks, or alternatively pass the baton on to another
* node for later deallocation.
*/
+
static void gfs2_drop_inode(struct inode *inode)
{
struct gfs2_inode *ip = GFS2_I(inode);
@@ -333,6 +380,16 @@ static void gfs2_clear_inode(struct inode *inode)
}
}
+static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
+{
+ do {
+ if (d1 == d2)
+ return 1;
+ d1 = d1->d_parent;
+ } while (!IS_ROOT(d1));
+ return 0;
+}
+
/**
* gfs2_show_options - Show mount options for /proc/mounts
* @s: seq_file structure
@@ -346,6 +403,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
struct gfs2_args *args = &sdp->sd_args;
+ if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
+ seq_printf(s, ",meta");
if (args->ar_lockproto[0])
seq_printf(s, ",lockproto=%s", args->ar_lockproto);
if (args->ar_locktable[0])
@@ -414,6 +473,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
* conversion on the iopen lock, but we can change that later. This
* is safe, just less efficient.
*/
+
static void gfs2_delete_inode(struct inode *inode)
{
struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
@@ -478,8 +538,6 @@ out:
clear_inode(inode);
}
-
-
static struct inode *gfs2_alloc_inode(struct super_block *sb)
{
struct gfs2_inode *ip;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 56aaf915c59..3e073f5144f 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -904,7 +904,7 @@ static int need_sync(struct gfs2_quota_data *qd)
do_sync = 0;
else {
value *= gfs2_jindex_size(sdp) * num;
- do_div(value, den);
+ value = div_s64(value, den);
value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
do_sync = 0;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 2888e4b4b1c..d5e91f4f6a0 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -428,6 +428,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
unsigned int message)
{
+ if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done)
+ return;
+
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
sdp->sd_lockstruct.ls_ops->lm_recovery_done(
sdp->sd_lockstruct.ls_lockspace, jid, message);
@@ -505,7 +508,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
- GL_NOCANCEL | GL_NOCACHE, &t_gh);
+ GL_NOCACHE, &t_gh);
if (error)
goto fail_gunlock_ji;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3401628d742..2d90fb25350 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -371,11 +371,6 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
spin_lock(&sdp->sd_rindex_spin);
sdp->sd_rindex_forward = NULL;
- head = &sdp->sd_rindex_recent_list;
- while (!list_empty(head)) {
- rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
- list_del(&rgd->rd_recent);
- }
spin_unlock(&sdp->sd_rindex_spin);
head = &sdp->sd_rindex_list;
@@ -945,107 +940,30 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
}
/**
- * recent_rgrp_first - get first RG from "recent" list
- * @sdp: The GFS2 superblock
- * @rglast: address of the rgrp used last
- *
- * Returns: The first rgrp in the recent list
- */
-
-static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
- u64 rglast)
-{
- struct gfs2_rgrpd *rgd;
-
- spin_lock(&sdp->sd_rindex_spin);
-
- if (rglast) {
- list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
- if (rgrp_contains_block(rgd, rglast))
- goto out;
- }
- }
- rgd = NULL;
- if (!list_empty(&sdp->sd_rindex_recent_list))
- rgd = list_entry(sdp->sd_rindex_recent_list.next,
- struct gfs2_rgrpd, rd_recent);
-out:
- spin_unlock(&sdp->sd_rindex_spin);
- return rgd;
-}
-
-/**
* recent_rgrp_next - get next RG from "recent" list
* @cur_rgd: current rgrp
- * @remove:
*
* Returns: The next rgrp in the recent list
*/
-static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
- int remove)
+static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd)
{
struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
struct list_head *head;
struct gfs2_rgrpd *rgd;
spin_lock(&sdp->sd_rindex_spin);
-
- head = &sdp->sd_rindex_recent_list;
-
- list_for_each_entry(rgd, head, rd_recent) {
- if (rgd == cur_rgd) {
- if (cur_rgd->rd_recent.next != head)
- rgd = list_entry(cur_rgd->rd_recent.next,
- struct gfs2_rgrpd, rd_recent);
- else
- rgd = NULL;
-
- if (remove)
- list_del(&cur_rgd->rd_recent);
-
- goto out;
- }
+ head = &sdp->sd_rindex_mru_list;
+ if (unlikely(cur_rgd->rd_list_mru.next == head)) {
+ spin_unlock(&sdp->sd_rindex_spin);
+ return NULL;
}
-
- rgd = NULL;
- if (!list_empty(head))
- rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
-
-out:
+ rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru);
spin_unlock(&sdp->sd_rindex_spin);
return rgd;
}
/**
- * recent_rgrp_add - add an RG to tail of "recent" list
- * @new_rgd: The rgrp to add
- *
- */
-
-static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
-{
- struct gfs2_sbd *sdp = new_rgd->rd_sbd;
- struct gfs2_rgrpd *rgd;
- unsigned int count = 0;
- unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
-
- spin_lock(&sdp->sd_rindex_spin);
-
- list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
- if (rgd == new_rgd)
- goto out;
-
- if (++count >= max)
- goto out;
- }
- list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
-
-out:
- spin_unlock(&sdp->sd_rindex_spin);
-}
-
-/**
* forward_rgrp_get - get an rgrp to try next from full list
* @sdp: The GFS2 superblock
*
@@ -1112,9 +1030,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
int loops = 0;
int error, rg_locked;
- /* Try recently successful rgrps */
-
- rgd = recent_rgrp_first(sdp, ip->i_goal);
+ rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
while (rgd) {
rg_locked = 0;
@@ -1136,11 +1052,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
if (inode)
return inode;
- rgd = recent_rgrp_next(rgd, 1);
- break;
-
+ /* fall through */
case GLR_TRYFAILED:
- rgd = recent_rgrp_next(rgd, 0);
+ rgd = recent_rgrp_next(rgd);
break;
default:
@@ -1199,7 +1113,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
out:
if (begin) {
- recent_rgrp_add(rgd);
+ spin_lock(&sdp->sd_rindex_spin);
+ list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+ spin_unlock(&sdp->sd_rindex_spin);
rgd = gfs2_rgrpd_get_next(rgd);
if (!rgd)
rgd = gfs2_rgrpd_get_first(sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7aeacbc65f3..c3ba3d9d0aa 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -33,314 +33,6 @@
#include "trans.h"
#include "util.h"
-static const u32 gfs2_old_fs_formats[] = {
- 0
-};
-
-static const u32 gfs2_old_multihost_formats[] = {
- 0
-};
-
-/**
- * gfs2_tune_init - Fill a gfs2_tune structure with default values
- * @gt: tune
- *
- */
-
-void gfs2_tune_init(struct gfs2_tune *gt)
-{
- spin_lock_init(&gt->gt_spin);
-
- gt->gt_demote_secs = 300;
- gt->gt_incore_log_blocks = 1024;
- gt->gt_log_flush_secs = 60;
- gt->gt_recoverd_secs = 60;
- gt->gt_logd_secs = 1;
- gt->gt_quotad_secs = 5;
- gt->gt_quota_simul_sync = 64;
- gt->gt_quota_warn_period = 10;
- gt->gt_quota_scale_num = 1;
- gt->gt_quota_scale_den = 1;
- gt->gt_quota_cache_secs = 300;
- gt->gt_quota_quantum = 60;
- gt->gt_atime_quantum = 3600;
- gt->gt_new_files_jdata = 0;
- gt->gt_new_files_directio = 0;
- gt->gt_max_readahead = 1 << 18;
- gt->gt_stall_secs = 600;
- gt->gt_complain_secs = 10;
- gt->gt_statfs_quantum = 30;
- gt->gt_statfs_slow = 0;
-}
-
-/**
- * gfs2_check_sb - Check superblock
- * @sdp: the filesystem
- * @sb: The superblock
- * @silent: Don't print a message if the check fails
- *
- * Checks the version code of the FS is one that we understand how to
- * read and that the sizes of the various on-disk structures have not
- * changed.
- */
-
-int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
-{
- unsigned int x;
-
- if (sb->sb_magic != GFS2_MAGIC ||
- sb->sb_type != GFS2_METATYPE_SB) {
- if (!silent)
- printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
- return -EINVAL;
- }
-
- /* If format numbers match exactly, we're done. */
-
- if (sb->sb_fs_format == GFS2_FORMAT_FS &&
- sb->sb_multihost_format == GFS2_FORMAT_MULTI)
- return 0;
-
- if (sb->sb_fs_format != GFS2_FORMAT_FS) {
- for (x = 0; gfs2_old_fs_formats[x]; x++)
- if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
- break;
-
- if (!gfs2_old_fs_formats[x]) {
- printk(KERN_WARNING
- "GFS2: code version (%u, %u) is incompatible "
- "with ondisk format (%u, %u)\n",
- GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
- sb->sb_fs_format, sb->sb_multihost_format);
- printk(KERN_WARNING
- "GFS2: I don't know how to upgrade this FS\n");
- return -EINVAL;
- }
- }
-
- if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
- for (x = 0; gfs2_old_multihost_formats[x]; x++)
- if (gfs2_old_multihost_formats[x] ==
- sb->sb_multihost_format)
- break;
-
- if (!gfs2_old_multihost_formats[x]) {
- printk(KERN_WARNING
- "GFS2: code version (%u, %u) is incompatible "
- "with ondisk format (%u, %u)\n",
- GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
- sb->sb_fs_format, sb->sb_multihost_format);
- printk(KERN_WARNING
- "GFS2: I don't know how to upgrade this FS\n");
- return -EINVAL;
- }
- }
-
- if (!sdp->sd_args.ar_upgrade) {
- printk(KERN_WARNING
- "GFS2: code version (%u, %u) is incompatible "
- "with ondisk format (%u, %u)\n",
- GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
- sb->sb_fs_format, sb->sb_multihost_format);
- printk(KERN_INFO
- "GFS2: Use the \"upgrade\" mount option to upgrade "
- "the FS\n");
- printk(KERN_INFO "GFS2: See the manual for more details\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-
-static void end_bio_io_page(struct bio *bio, int error)
-{
- struct page *page = bio->bi_private;
-
- if (!error)
- SetPageUptodate(page);
- else
- printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
- unlock_page(page);
-}
-
-static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
-{
- const struct gfs2_sb *str = buf;
-
- sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
- sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
- sb->sb_format = be32_to_cpu(str->sb_header.mh_format);
- sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
- sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
- sb->sb_bsize = be32_to_cpu(str->sb_bsize);
- sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
- sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
- sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
- sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
- sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
-
- memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
- memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
-}
-
-/**
- * gfs2_read_super - Read the gfs2 super block from disk
- * @sdp: The GFS2 super block
- * @sector: The location of the super block
- * @error: The error code to return
- *
- * This uses the bio functions to read the super block from disk
- * because we want to be 100% sure that we never read cached data.
- * A super block is read twice only during each GFS2 mount and is
- * never written to by the filesystem. The first time its read no
- * locks are held, and the only details which are looked at are those
- * relating to the locking protocol. Once locking is up and working,
- * the sb is read again under the lock to establish the location of
- * the master directory (contains pointers to journals etc) and the
- * root directory.
- *
- * Returns: 0 on success or error
- */
-
-int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
-{
- struct super_block *sb = sdp->sd_vfs;
- struct gfs2_sb *p;
- struct page *page;
- struct bio *bio;
-
- page = alloc_page(GFP_NOFS);
- if (unlikely(!page))
- return -ENOBUFS;
-
- ClearPageUptodate(page);
- ClearPageDirty(page);
- lock_page(page);
-
- bio = bio_alloc(GFP_NOFS, 1);
- if (unlikely(!bio)) {
- __free_page(page);
- return -ENOBUFS;
- }
-
- bio->bi_sector = sector * (sb->s_blocksize >> 9);
- bio->bi_bdev = sb->s_bdev;
- bio_add_page(bio, page, PAGE_SIZE, 0);
-
- bio->bi_end_io = end_bio_io_page;
- bio->bi_private = page;
- submit_bio(READ_SYNC | (1 << BIO_RW_META), bio);
- wait_on_page_locked(page);
- bio_put(bio);
- if (!PageUptodate(page)) {
- __free_page(page);
- return -EIO;
- }
- p = kmap(page);
- gfs2_sb_in(&sdp->sd_sb, p);
- kunmap(page);
- __free_page(page);
- return 0;
-}
-
-/**
- * gfs2_read_sb - Read super block
- * @sdp: The GFS2 superblock
- * @gl: the glock for the superblock (assumed to be held)
- * @silent: Don't print message if mount fails
- *
- */
-
-int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
-{
- u32 hash_blocks, ind_blocks, leaf_blocks;
- u32 tmp_blocks;
- unsigned int x;
- int error;
-
- error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift);
- if (error) {
- if (!silent)
- fs_err(sdp, "can't read superblock\n");
- return error;
- }
-
- error = gfs2_check_sb(sdp, &sdp->sd_sb, silent);
- if (error)
- return error;
-
- sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift -
- GFS2_BASIC_BLOCK_SHIFT;
- sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
- sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
- sizeof(struct gfs2_dinode)) / sizeof(u64);
- sdp->sd_inptrs = (sdp->sd_sb.sb_bsize -
- sizeof(struct gfs2_meta_header)) / sizeof(u64);
- sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header);
- sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2;
- sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1;
- sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64);
- sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize -
- sizeof(struct gfs2_meta_header)) /
- sizeof(struct gfs2_quota_change);
-
- /* Compute maximum reservation required to add a entry to a directory */
-
- hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH),
- sdp->sd_jbsize);
-
- ind_blocks = 0;
- for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
- tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
- ind_blocks += tmp_blocks;
- }
-
- leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
-
- sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
-
- sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
- sizeof(struct gfs2_dinode);
- sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
- for (x = 2;; x++) {
- u64 space, d;
- u32 m;
-
- space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
- d = space;
- m = do_div(d, sdp->sd_inptrs);
-
- if (d != sdp->sd_heightsize[x - 1] || m)
- break;
- sdp->sd_heightsize[x] = space;
- }
- sdp->sd_max_height = x;
- sdp->sd_heightsize[x] = ~0;
- gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
-
- sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
- sizeof(struct gfs2_dinode);
- sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs;
- for (x = 2;; x++) {
- u64 space, d;
- u32 m;
-
- space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs;
- d = space;
- m = do_div(d, sdp->sd_inptrs);
-
- if (d != sdp->sd_jheightsize[x - 1] || m)
- break;
- sdp->sd_jheightsize[x] = space;
- }
- sdp->sd_max_jheight = x;
- sdp->sd_jheightsize[x] = ~0;
- gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
-
- return 0;
-}
-
/**
* gfs2_jindex_hold - Grab a lock on the jindex
* @sdp: The GFS2 superblock
@@ -390,7 +82,7 @@ int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
break;
INIT_LIST_HEAD(&jd->extent_list);
- jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1, NULL);
+ jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
if (!jd->jd_inode)
error = -ENOENT;
@@ -582,39 +274,6 @@ fail:
return error;
}
-/**
- * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
- * @sdp: the filesystem
- *
- * Returns: errno
- */
-
-int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
-{
- struct gfs2_holder t_gh;
- int error;
-
- gfs2_quota_sync(sdp);
- gfs2_statfs_sync(sdp);
-
- error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
- &t_gh);
- if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
- return error;
-
- gfs2_meta_syncfs(sdp);
- gfs2_log_shutdown(sdp);
-
- clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
-
- if (t_gh.gh_gl)
- gfs2_glock_dq_uninit(&t_gh);
-
- gfs2_quota_cleanup(sdp);
-
- return error;
-}
-
static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf)
{
const struct gfs2_statfs_change *str = buf;
@@ -941,8 +600,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
}
error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
- LM_FLAG_PRIORITY | GL_NOCACHE,
- t_gh);
+ GL_NOCACHE, t_gh);
list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
error = gfs2_jdesc_check(jd);
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 44361ecc44f..50a4c9b1215 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -12,11 +12,6 @@
#include "incore.h"
-void gfs2_tune_init(struct gfs2_tune *gt);
-
-int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent);
-int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
-int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector);
void gfs2_lm_unmount(struct gfs2_sbd *sdp);
static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
@@ -40,7 +35,6 @@ int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
struct gfs2_inode **ipp);
int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
-int gfs2_make_fs_ro(struct gfs2_sbd *sdp);
int gfs2_statfs_init(struct gfs2_sbd *sdp);
void gfs2_statfs_change(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 9ab9fc85ecd..7e1879f1a02 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -110,18 +110,6 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
return len;
}
-static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
-{
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
-
- if (simple_strtol(buf, NULL, 0) != 1)
- return -EINVAL;
-
- gfs2_gl_hash_clear(sdp, NO_WAIT);
- return len;
-}
-
static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
@@ -175,7 +163,6 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
GFS2_ATTR(id, 0444, id_show, NULL);
GFS2_ATTR(fsname, 0444, fsname_show, NULL);
GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
-GFS2_ATTR(shrink, 0200, NULL, shrink_store);
GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
@@ -186,7 +173,6 @@ static struct attribute *gfs2_attrs[] = {
&gfs2_attr_id.attr,
&gfs2_attr_fsname.attr,
&gfs2_attr_freeze.attr,
- &gfs2_attr_shrink.attr,
&gfs2_attr_withdraw.attr,
&gfs2_attr_statfs_sync.attr,
&gfs2_attr_quota_sync.attr,
@@ -283,14 +269,6 @@ ARGS_ATTR(quota, "%u\n");
ARGS_ATTR(suiddir, "%d\n");
ARGS_ATTR(data, "%d\n");
-/* one oddball doesn't fit the macro mold */
-static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%d\n",
- !!test_bit(SDF_NOATIME, &sdp->sd_flags));
-}
-static struct args_attr args_attr_noatime = __ATTR_RO(noatime);
-
static struct attribute *args_attrs[] = {
&args_attr_lockproto.attr,
&args_attr_locktable.attr,
@@ -306,7 +284,6 @@ static struct attribute *args_attrs[] = {
&args_attr_quota.attr,
&args_attr_suiddir.attr,
&args_attr_data.attr,
- &args_attr_noatime.attr,
NULL,
};
@@ -421,12 +398,10 @@ TUNE_ATTR(incore_log_blocks, 0);
TUNE_ATTR(log_flush_secs, 0);
TUNE_ATTR(quota_warn_period, 0);
TUNE_ATTR(quota_quantum, 0);
-TUNE_ATTR(atime_quantum, 0);
TUNE_ATTR(max_readahead, 0);
TUNE_ATTR(complain_secs, 0);
TUNE_ATTR(statfs_slow, 0);
TUNE_ATTR(new_files_jdata, 0);
-TUNE_ATTR(new_files_directio, 0);
TUNE_ATTR(quota_simul_sync, 1);
TUNE_ATTR(quota_cache_secs, 1);
TUNE_ATTR(stall_secs, 1);
@@ -442,7 +417,6 @@ static struct attribute *tune_attrs[] = {
&tune_attr_log_flush_secs.attr,
&tune_attr_quota_warn_period.attr,
&tune_attr_quota_quantum.attr,
- &tune_attr_atime_quantum.attr,
&tune_attr_max_readahead.attr,
&tune_attr_complain_secs.attr,
&tune_attr_statfs_slow.attr,
@@ -455,7 +429,6 @@ static struct attribute *tune_attrs[] = {
&tune_attr_quotad_secs.attr,
&tune_attr_quota_scale.attr,
&tune_attr_new_files_jdata.attr,
- &tune_attr_new_files_directio.attr,
NULL,
};
diff --git a/fs/hfs/bitmap.c b/fs/hfs/bitmap.c
index 24e75798ddf..c6e97366e8a 100644
--- a/fs/hfs/bitmap.c
+++ b/fs/hfs/bitmap.c
@@ -145,7 +145,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits)
if (!*num_bits)
return 0;
- down(&HFS_SB(sb)->bitmap_lock);
+ mutex_lock(&HFS_SB(sb)->bitmap_lock);
bitmap = HFS_SB(sb)->bitmap;
pos = hfs_find_set_zero_bits(bitmap, HFS_SB(sb)->fs_ablocks, goal, num_bits);
@@ -162,7 +162,7 @@ u32 hfs_vbm_search_free(struct super_block *sb, u32 goal, u32 *num_bits)
HFS_SB(sb)->free_ablocks -= *num_bits;
hfs_bitmap_dirty(sb);
out:
- up(&HFS_SB(sb)->bitmap_lock);
+ mutex_unlock(&HFS_SB(sb)->bitmap_lock);
return pos;
}
@@ -205,7 +205,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count)
if ((start + count) > HFS_SB(sb)->fs_ablocks)
return -2;
- down(&HFS_SB(sb)->bitmap_lock);
+ mutex_lock(&HFS_SB(sb)->bitmap_lock);
/* bitmap is always on a 32-bit boundary */
curr = HFS_SB(sb)->bitmap + (start / 32);
len = count;
@@ -236,7 +236,7 @@ int hfs_clear_vbm_bits(struct super_block *sb, u16 start, u16 count)
}
out:
HFS_SB(sb)->free_ablocks += len;
- up(&HFS_SB(sb)->bitmap_lock);
+ mutex_unlock(&HFS_SB(sb)->bitmap_lock);
hfs_bitmap_dirty(sb);
return 0;
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index f6621a78520..9b9d6395bad 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -40,7 +40,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
{
struct hfs_mdb *mdb = HFS_SB(sb)->mdb;
HFS_I(tree->inode)->flags = 0;
- init_MUTEX(&HFS_I(tree->inode)->extents_lock);
+ mutex_init(&HFS_I(tree->inode)->extents_lock);
switch (id) {
case HFS_EXT_CNID:
hfs_inode_read_fork(tree->inode, mdb->drXTExtRec, mdb->drXTFlSize,
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index c176f67ba0a..2c16316d291 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -343,16 +343,16 @@ int hfs_get_block(struct inode *inode, sector_t block,
goto done;
}
- down(&HFS_I(inode)->extents_lock);
+ mutex_lock(&HFS_I(inode)->extents_lock);
res = hfs_ext_read_extent(inode, ablock);
if (!res)
dblock = hfs_ext_find_block(HFS_I(inode)->cached_extents,
ablock - HFS_I(inode)->cached_start);
else {
- up(&HFS_I(inode)->extents_lock);
+ mutex_unlock(&HFS_I(inode)->extents_lock);
return -EIO;
}
- up(&HFS_I(inode)->extents_lock);
+ mutex_unlock(&HFS_I(inode)->extents_lock);
done:
map_bh(bh_result, sb, HFS_SB(sb)->fs_start +
@@ -375,7 +375,7 @@ int hfs_extend_file(struct inode *inode)
u32 start, len, goal;
int res;
- down(&HFS_I(inode)->extents_lock);
+ mutex_lock(&HFS_I(inode)->extents_lock);
if (HFS_I(inode)->alloc_blocks == HFS_I(inode)->first_blocks)
goal = hfs_ext_lastblock(HFS_I(inode)->first_extents);
else {
@@ -425,7 +425,7 @@ int hfs_extend_file(struct inode *inode)
goto insert_extent;
}
out:
- up(&HFS_I(inode)->extents_lock);
+ mutex_unlock(&HFS_I(inode)->extents_lock);
if (!res) {
HFS_I(inode)->alloc_blocks += len;
mark_inode_dirty(inode);
@@ -487,7 +487,7 @@ void hfs_file_truncate(struct inode *inode)
if (blk_cnt == alloc_cnt)
goto out;
- down(&HFS_I(inode)->extents_lock);
+ mutex_lock(&HFS_I(inode)->extents_lock);
hfs_find_init(HFS_SB(sb)->ext_tree, &fd);
while (1) {
if (alloc_cnt == HFS_I(inode)->first_blocks) {
@@ -514,7 +514,7 @@ void hfs_file_truncate(struct inode *inode)
hfs_brec_remove(&fd);
}
hfs_find_exit(&fd);
- up(&HFS_I(inode)->extents_lock);
+ mutex_unlock(&HFS_I(inode)->extents_lock);
HFS_I(inode)->alloc_blocks = blk_cnt;
out:
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 147374b6f67..9955232fdf8 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -11,6 +11,7 @@
#include <linux/slab.h>
#include <linux/types.h>
+#include <linux/mutex.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
@@ -53,7 +54,7 @@ struct hfs_inode_info {
struct list_head open_dir_list;
struct inode *rsrc_inode;
- struct semaphore extents_lock;
+ struct mutex extents_lock;
u16 alloc_blocks, clump_blocks;
sector_t fs_blocks;
@@ -139,7 +140,7 @@ struct hfs_sb_info {
struct nls_table *nls_io, *nls_disk;
- struct semaphore bitmap_lock;
+ struct mutex bitmap_lock;
unsigned long flags;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 97f8446c4ff..7e19835efa2 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -150,7 +150,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
if (!inode)
return NULL;
- init_MUTEX(&HFS_I(inode)->extents_lock);
+ mutex_init(&HFS_I(inode)->extents_lock);
INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name);
inode->i_ino = HFS_SB(sb)->next_id++;
@@ -281,7 +281,7 @@ static int hfs_read_inode(struct inode *inode, void *data)
HFS_I(inode)->flags = 0;
HFS_I(inode)->rsrc_inode = NULL;
- init_MUTEX(&HFS_I(inode)->extents_lock);
+ mutex_init(&HFS_I(inode)->extents_lock);
INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
/* Initialize the inode */
@@ -511,8 +511,7 @@ void hfs_clear_inode(struct inode *inode)
}
}
-static int hfs_permission(struct inode *inode, int mask,
- struct nameidata *nd)
+static int hfs_permission(struct inode *inode, int mask)
{
if (S_ISREG(inode->i_mode) && mask & MAY_EXEC)
return 0;
@@ -523,8 +522,6 @@ static int hfs_file_open(struct inode *inode, struct file *file)
{
if (HFS_IS_RSRC(inode))
inode = HFS_I(inode)->rsrc_inode;
- if (atomic_read(&file->f_count) != 1)
- return 0;
atomic_inc(&HFS_I(inode)->opencnt);
return 0;
}
@@ -535,8 +532,6 @@ static int hfs_file_release(struct inode *inode, struct file *file)
if (HFS_IS_RSRC(inode))
inode = HFS_I(inode)->rsrc_inode;
- if (atomic_read(&file->f_count) != 0)
- return 0;
if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) {
mutex_lock(&inode->i_mutex);
hfs_file_truncate(inode);
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 8cf67974adf..3c7c7637719 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -173,7 +173,7 @@ enum {
opt_err
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{ opt_uid, "uid=%u" },
{ opt_gid, "gid=%u" },
{ opt_umask, "umask=%o" },
@@ -372,7 +372,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &hfs_super_operations;
sb->s_flags |= MS_NODIRATIME;
- init_MUTEX(&sbi->bitmap_lock);
+ mutex_init(&sbi->bitmap_lock);
res = hfs_mdb_get(sb);
if (res) {
@@ -432,7 +432,7 @@ static struct file_system_type hfs_fs_type = {
.fs_flags = FS_REQUIRES_DEV,
};
-static void hfs_init_once(struct kmem_cache *cachep, void *p)
+static void hfs_init_once(void *p)
{
struct hfs_inode_info *i = p;
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 12e899cd788..fec8f61227f 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -199,16 +199,16 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
goto done;
}
- down(&HFSPLUS_I(inode).extents_lock);
+ mutex_lock(&HFSPLUS_I(inode).extents_lock);
res = hfsplus_ext_read_extent(inode, ablock);
if (!res) {
dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock -
HFSPLUS_I(inode).cached_start);
} else {
- up(&HFSPLUS_I(inode).extents_lock);
+ mutex_unlock(&HFSPLUS_I(inode).extents_lock);
return -EIO;
}
- up(&HFSPLUS_I(inode).extents_lock);
+ mutex_unlock(&HFSPLUS_I(inode).extents_lock);
done:
dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
@@ -355,7 +355,7 @@ int hfsplus_file_extend(struct inode *inode)
return -ENOSPC;
}
- down(&HFSPLUS_I(inode).extents_lock);
+ mutex_lock(&HFSPLUS_I(inode).extents_lock);
if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks)
goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents);
else {
@@ -408,7 +408,7 @@ int hfsplus_file_extend(struct inode *inode)
goto insert_extent;
}
out:
- up(&HFSPLUS_I(inode).extents_lock);
+ mutex_unlock(&HFSPLUS_I(inode).extents_lock);
if (!res) {
HFSPLUS_I(inode).alloc_blocks += len;
mark_inode_dirty(inode);
@@ -465,7 +465,7 @@ void hfsplus_file_truncate(struct inode *inode)
if (blk_cnt == alloc_cnt)
goto out;
- down(&HFSPLUS_I(inode).extents_lock);
+ mutex_lock(&HFSPLUS_I(inode).extents_lock);
hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd);
while (1) {
if (alloc_cnt == HFSPLUS_I(inode).first_blocks) {
@@ -492,7 +492,7 @@ void hfsplus_file_truncate(struct inode *inode)
hfs_brec_remove(&fd);
}
hfs_find_exit(&fd);
- up(&HFSPLUS_I(inode).extents_lock);
+ mutex_unlock(&HFSPLUS_I(inode).extents_lock);
HFSPLUS_I(inode).alloc_blocks = blk_cnt;
out:
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 9e59537b43d..f027a905225 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -11,6 +11,7 @@
#define _LINUX_HFSPLUS_FS_H
#include <linux/fs.h>
+#include <linux/mutex.h>
#include <linux/buffer_head.h>
#include "hfsplus_raw.h"
@@ -154,7 +155,7 @@ struct hfsplus_sb_info {
struct hfsplus_inode_info {
- struct semaphore extents_lock;
+ struct mutex extents_lock;
u32 clump_blocks, alloc_blocks;
sector_t fs_blocks;
/* Allocation extents from catalog record or volume header */
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 67e1c8b467c..b085d64a2b6 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -163,7 +163,7 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
inode->i_ino = dir->i_ino;
INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
- init_MUTEX(&HFSPLUS_I(inode).extents_lock);
+ mutex_init(&HFSPLUS_I(inode).extents_lock);
HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC;
hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
@@ -238,7 +238,7 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
}
-static int hfsplus_permission(struct inode *inode, int mask, struct nameidata *nd)
+static int hfsplus_permission(struct inode *inode, int mask)
{
/* MAY_EXEC is also used for lookup, if no x bit is set allow lookup,
* open_exec has the same test, so it's still not executable, if a x bit
@@ -254,8 +254,6 @@ static int hfsplus_file_open(struct inode *inode, struct file *file)
{
if (HFSPLUS_IS_RSRC(inode))
inode = HFSPLUS_I(inode).rsrc_inode;
- if (atomic_read(&file->f_count) != 1)
- return 0;
atomic_inc(&HFSPLUS_I(inode).opencnt);
return 0;
}
@@ -266,8 +264,6 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
if (HFSPLUS_IS_RSRC(inode))
inode = HFSPLUS_I(inode).rsrc_inode;
- if (atomic_read(&file->f_count) != 0)
- return 0;
if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) {
mutex_lock(&inode->i_mutex);
hfsplus_file_truncate(inode);
@@ -316,7 +312,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
inode->i_nlink = 1;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
- init_MUTEX(&HFSPLUS_I(inode).extents_lock);
+ mutex_init(&HFSPLUS_I(inode).extents_lock);
atomic_set(&HFSPLUS_I(inode).opencnt, 0);
HFSPLUS_I(inode).flags = 0;
memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec));
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 9997cbf8beb..9699c56d323 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -25,7 +25,7 @@ enum {
opt_force, opt_err
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{ opt_creator, "creator=%s" },
{ opt_type, "type=%s" },
{ opt_umask, "umask=%o" },
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index ce97a54518d..e834e578c93 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -34,7 +34,7 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
return inode;
INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list);
- init_MUTEX(&HFSPLUS_I(inode).extents_lock);
+ mutex_init(&HFSPLUS_I(inode).extents_lock);
HFSPLUS_I(inode).flags = 0;
HFSPLUS_I(inode).rsrc_inode = NULL;
atomic_set(&HFSPLUS_I(inode).opencnt, 0);
@@ -485,7 +485,7 @@ static struct file_system_type hfsplus_fs_type = {
.fs_flags = FS_REQUIRES_DEV,
};
-static void hfsplus_init_once(struct kmem_cache *cachep, void *p)
+static void hfsplus_init_once(void *p)
{
struct hfsplus_inode_info *i = p;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 5222345ddcc..d6ecabf4d23 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -822,7 +822,7 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
return err;
}
-int hostfs_permission(struct inode *ino, int desired, struct nameidata *nd)
+int hostfs_permission(struct inode *ino, int desired)
{
char *name;
int r = 0, w = 0, x = 0, err;
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index d256559b410..d9c59a77544 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -415,7 +415,7 @@ again:
d_drop(dentry);
spin_lock(&dentry->d_lock);
if (atomic_read(&dentry->d_count) > 1 ||
- permission(inode, MAY_WRITE, NULL) ||
+ generic_permission(inode, MAY_WRITE, NULL) ||
!S_ISREG(inode->i_mode) ||
get_write_access(inode)) {
spin_unlock(&dentry->d_lock);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index f63a699ec65..29ad461d568 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -173,7 +173,7 @@ static void hpfs_destroy_inode(struct inode *inode)
kmem_cache_free(hpfs_inode_cachep, hpfs_i(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct hpfs_inode_info *ei = (struct hpfs_inode_info *) foo;
@@ -215,7 +215,7 @@ enum {
Opt_timeshift, Opt_err,
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_help, "help"},
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 65077aa90f0..2b3d1828db9 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -655,20 +655,13 @@ static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd);
}
-int hppfs_permission(struct inode *inode, int mask, struct nameidata *nd)
-{
- return generic_permission(inode, mask, NULL);
-}
-
static const struct inode_operations hppfs_dir_iops = {
.lookup = hppfs_lookup,
- .permission = hppfs_permission,
};
static const struct inode_operations hppfs_link_iops = {
.readlink = hppfs_readlink,
.follow_link = hppfs_follow_link,
- .permission = hppfs_permission,
};
static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index aeabf80f81a..61edc701b0e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -53,15 +53,17 @@ int sysctl_hugetlb_shm_group;
enum {
Opt_size, Opt_nr_inodes,
Opt_mode, Opt_uid, Opt_gid,
+ Opt_pagesize,
Opt_err,
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_size, "size=%s"},
{Opt_nr_inodes, "nr_inodes=%s"},
{Opt_mode, "mode=%o"},
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
+ {Opt_pagesize, "pagesize=%s"},
{Opt_err, NULL},
};
@@ -80,6 +82,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
struct inode *inode = file->f_path.dentry->d_inode;
loff_t len, vma_len;
int ret;
+ struct hstate *h = hstate_file(file);
/*
* vma address alignment (but not the pgoff alignment) has
@@ -92,7 +95,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_flags |= VM_HUGETLB | VM_RESERVED;
vma->vm_ops = &hugetlb_vm_ops;
- if (vma->vm_pgoff & ~(HPAGE_MASK >> PAGE_SHIFT))
+ if (vma->vm_pgoff & ~(huge_page_mask(h) >> PAGE_SHIFT))
return -EINVAL;
vma_len = (loff_t)(vma->vm_end - vma->vm_start);
@@ -103,9 +106,9 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
ret = -ENOMEM;
len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
- if (vma->vm_flags & VM_MAYSHARE &&
- hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
- len >> HPAGE_SHIFT))
+ if (hugetlb_reserve_pages(inode,
+ vma->vm_pgoff >> huge_page_order(h),
+ len >> huge_page_shift(h), vma))
goto out;
ret = 0;
@@ -130,20 +133,21 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long start_addr;
+ struct hstate *h = hstate_file(file);
- if (len & ~HPAGE_MASK)
+ if (len & ~huge_page_mask(h))
return -EINVAL;
if (len > TASK_SIZE)
return -ENOMEM;
if (flags & MAP_FIXED) {
- if (prepare_hugepage_range(addr, len))
+ if (prepare_hugepage_range(file, addr, len))
return -EINVAL;
return addr;
}
if (addr) {
- addr = ALIGN(addr, HPAGE_SIZE);
+ addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
@@ -156,7 +160,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
start_addr = TASK_UNMAPPED_BASE;
full_search:
- addr = ALIGN(start_addr, HPAGE_SIZE);
+ addr = ALIGN(start_addr, huge_page_size(h));
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
/* At this point: (!vma || addr < vma->vm_end). */
@@ -174,7 +178,7 @@ full_search:
if (!vma || addr + len <= vma->vm_start)
return addr;
- addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+ addr = ALIGN(vma->vm_end, huge_page_size(h));
}
}
#endif
@@ -225,10 +229,11 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset,
static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
size_t len, loff_t *ppos)
{
+ struct hstate *h = hstate_file(filp);
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
- unsigned long index = *ppos >> HPAGE_SHIFT;
- unsigned long offset = *ppos & ~HPAGE_MASK;
+ unsigned long index = *ppos >> huge_page_shift(h);
+ unsigned long offset = *ppos & ~huge_page_mask(h);
unsigned long end_index;
loff_t isize;
ssize_t retval = 0;
@@ -243,17 +248,17 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
if (!isize)
goto out;
- end_index = (isize - 1) >> HPAGE_SHIFT;
+ end_index = (isize - 1) >> huge_page_shift(h);
for (;;) {
struct page *page;
- int nr, ret;
+ unsigned long nr, ret;
/* nr is the maximum number of bytes to copy from this page */
- nr = HPAGE_SIZE;
+ nr = huge_page_size(h);
if (index >= end_index) {
if (index > end_index)
goto out;
- nr = ((isize - 1) & ~HPAGE_MASK) + 1;
+ nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
if (nr <= offset) {
goto out;
}
@@ -287,8 +292,8 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
offset += ret;
retval += ret;
len -= ret;
- index += offset >> HPAGE_SHIFT;
- offset &= ~HPAGE_MASK;
+ index += offset >> huge_page_shift(h);
+ offset &= ~huge_page_mask(h);
if (page)
page_cache_release(page);
@@ -298,7 +303,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf,
break;
}
out:
- *ppos = ((loff_t)index << HPAGE_SHIFT) + offset;
+ *ppos = ((loff_t)index << huge_page_shift(h)) + offset;
mutex_unlock(&inode->i_mutex);
return retval;
}
@@ -339,8 +344,9 @@ static void truncate_huge_page(struct page *page)
static void truncate_hugepages(struct inode *inode, loff_t lstart)
{
+ struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
- const pgoff_t start = lstart >> HPAGE_SHIFT;
+ const pgoff_t start = lstart >> huge_page_shift(h);
struct pagevec pvec;
pgoff_t next;
int i, freed = 0;
@@ -441,7 +447,7 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff)
v_offset = 0;
__unmap_hugepage_range(vma,
- vma->vm_start + v_offset, vma->vm_end);
+ vma->vm_start + v_offset, vma->vm_end, NULL);
}
}
@@ -449,8 +455,9 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
pgoff_t pgoff;
struct address_space *mapping = inode->i_mapping;
+ struct hstate *h = hstate_inode(inode);
- BUG_ON(offset & ~HPAGE_MASK);
+ BUG_ON(offset & ~huge_page_mask(h));
pgoff = offset >> PAGE_SHIFT;
i_size_write(inode, offset);
@@ -465,6 +472,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
+ struct hstate *h = hstate_inode(inode);
int error;
unsigned int ia_valid = attr->ia_valid;
@@ -476,7 +484,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
if (ia_valid & ATTR_SIZE) {
error = -EINVAL;
- if (!(attr->ia_size & ~HPAGE_MASK))
+ if (!(attr->ia_size & ~huge_page_mask(h)))
error = hugetlb_vmtruncate(inode, attr->ia_size);
if (error)
goto out;
@@ -610,9 +618,10 @@ static int hugetlbfs_set_page_dirty(struct page *page)
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
+ struct hstate *h = hstate_inode(dentry->d_inode);
buf->f_type = HUGETLBFS_MAGIC;
- buf->f_bsize = HPAGE_SIZE;
+ buf->f_bsize = huge_page_size(h);
if (sbinfo) {
spin_lock(&sbinfo->stat_lock);
/* If no limits set, just report 0 for max/free/used
@@ -696,7 +705,7 @@ static const struct address_space_operations hugetlbfs_aops = {
};
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct hugetlbfs_inode_info *ei = (struct hugetlbfs_inode_info *)foo;
@@ -743,6 +752,8 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
char *p, *rest;
substring_t args[MAX_OPT_ARGS];
int option;
+ unsigned long long size = 0;
+ enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE;
if (!options)
return 0;
@@ -773,17 +784,13 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
break;
case Opt_size: {
- unsigned long long size;
/* memparse() will accept a K/M/G without a digit */
if (!isdigit(*args[0].from))
goto bad_val;
size = memparse(args[0].from, &rest);
- if (*rest == '%') {
- size <<= HPAGE_SHIFT;
- size *= max_huge_pages;
- do_div(size, 100);
- }
- pconfig->nr_blocks = (size >> HPAGE_SHIFT);
+ setsize = SIZE_STD;
+ if (*rest == '%')
+ setsize = SIZE_PERCENT;
break;
}
@@ -794,6 +801,19 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
pconfig->nr_inodes = memparse(args[0].from, &rest);
break;
+ case Opt_pagesize: {
+ unsigned long ps;
+ ps = memparse(args[0].from, &rest);
+ pconfig->hstate = size_to_hstate(ps);
+ if (!pconfig->hstate) {
+ printk(KERN_ERR
+ "hugetlbfs: Unsupported page size %lu MB\n",
+ ps >> 20);
+ return -EINVAL;
+ }
+ break;
+ }
+
default:
printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n",
p);
@@ -801,6 +821,18 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
break;
}
}
+
+ /* Do size after hstate is set up */
+ if (setsize > NO_SIZE) {
+ struct hstate *h = pconfig->hstate;
+ if (setsize == SIZE_PERCENT) {
+ size <<= huge_page_shift(h);
+ size *= h->max_huge_pages;
+ do_div(size, 100);
+ }
+ pconfig->nr_blocks = (size >> huge_page_shift(h));
+ }
+
return 0;
bad_val:
@@ -825,6 +857,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
config.uid = current->fsuid;
config.gid = current->fsgid;
config.mode = 0755;
+ config.hstate = &default_hstate;
ret = hugetlbfs_parse_options(data, &config);
if (ret)
return ret;
@@ -833,14 +866,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
if (!sbinfo)
return -ENOMEM;
sb->s_fs_info = sbinfo;
+ sbinfo->hstate = config.hstate;
spin_lock_init(&sbinfo->stat_lock);
sbinfo->max_blocks = config.nr_blocks;
sbinfo->free_blocks = config.nr_blocks;
sbinfo->max_inodes = config.nr_inodes;
sbinfo->free_inodes = config.nr_inodes;
sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_blocksize = HPAGE_SIZE;
- sb->s_blocksize_bits = HPAGE_SHIFT;
+ sb->s_blocksize = huge_page_size(config.hstate);
+ sb->s_blocksize_bits = huge_page_shift(config.hstate);
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
sb->s_time_gran = 1;
@@ -942,7 +976,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
goto out_dentry;
error = -ENOMEM;
- if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
+ if (hugetlb_reserve_pages(inode, 0,
+ size >> huge_page_shift(hstate_inode(inode)), NULL))
goto out_inode;
d_instantiate(dentry, inode);
diff --git a/fs/inode.c b/fs/inode.c
index c36d9480335..0487ddba139 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -166,6 +166,7 @@ static struct inode *alloc_inode(struct super_block *sb)
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
mapping->assoc_mapping = NULL;
mapping->backing_dev_info = &default_backing_dev_info;
+ mapping->writeback_index = 0;
/*
* If the block_device provides a backing_dev_info for client
@@ -209,7 +210,7 @@ void inode_init_once(struct inode *inode)
INIT_LIST_HEAD(&inode->i_dentry);
INIT_LIST_HEAD(&inode->i_devices);
INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
- rwlock_init(&inode->i_data.tree_lock);
+ spin_lock_init(&inode->i_data.tree_lock);
spin_lock_init(&inode->i_data.i_mmap_lock);
INIT_LIST_HEAD(&inode->i_data.private_list);
spin_lock_init(&inode->i_data.private_lock);
@@ -224,7 +225,7 @@ void inode_init_once(struct inode *inode)
EXPORT_SYMBOL(inode_init_once);
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
{
struct inode * inode = (struct inode *) foo;
diff --git a/fs/inotify_user.c b/fs/inotify_user.c
index 6676c06bb7c..d85c7d931cd 100644
--- a/fs/inotify_user.c
+++ b/fs/inotify_user.c
@@ -323,7 +323,7 @@ out:
}
/*
- * remove_kevent - cleans up and ultimately frees the given kevent
+ * remove_kevent - cleans up the given kevent
*
* Caller must hold dev->ev_mutex.
*/
@@ -334,7 +334,13 @@ static void remove_kevent(struct inotify_device *dev,
dev->event_count--;
dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
+}
+/*
+ * free_kevent - frees the given kevent.
+ */
+static void free_kevent(struct inotify_kernel_event *kevent)
+{
kfree(kevent->name);
kmem_cache_free(event_cachep, kevent);
}
@@ -350,24 +356,25 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev)
struct inotify_kernel_event *kevent;
kevent = inotify_dev_get_event(dev);
remove_kevent(dev, kevent);
+ free_kevent(kevent);
}
}
/*
- * find_inode - resolve a user-given path to a specific inode and return a nd
+ * find_inode - resolve a user-given path to a specific inode
*/
-static int find_inode(const char __user *dirname, struct nameidata *nd,
+static int find_inode(const char __user *dirname, struct path *path,
unsigned flags)
{
int error;
- error = __user_walk(dirname, flags, nd);
+ error = user_path_at(AT_FDCWD, dirname, flags, path);
if (error)
return error;
/* you can only watch an inode if you have read permissions on it */
- error = vfs_permission(nd, MAY_READ);
+ error = inode_permission(path->dentry->d_inode, MAY_READ);
if (error)
- path_put(&nd->path);
+ path_put(path);
return error;
}
@@ -433,17 +440,15 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
dev = file->private_data;
while (1) {
- int events;
prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
mutex_lock(&dev->ev_mutex);
- events = !list_empty(&dev->events);
- mutex_unlock(&dev->ev_mutex);
- if (events) {
+ if (!list_empty(&dev->events)) {
ret = 0;
break;
}
+ mutex_unlock(&dev->ev_mutex);
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
@@ -462,7 +467,6 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
if (ret)
return ret;
- mutex_lock(&dev->ev_mutex);
while (1) {
struct inotify_kernel_event *kevent;
@@ -481,6 +485,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
}
break;
}
+ remove_kevent(dev, kevent);
+
+ /*
+ * Must perform the copy_to_user outside the mutex in order
+ * to avoid a lock order reversal with mmap_sem.
+ */
+ mutex_unlock(&dev->ev_mutex);
if (copy_to_user(buf, &kevent->event, event_size)) {
ret = -EFAULT;
@@ -498,7 +509,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
count -= kevent->event.len;
}
- remove_kevent(dev, kevent);
+ free_kevent(kevent);
+
+ mutex_lock(&dev->ev_mutex);
}
mutex_unlock(&dev->ev_mutex);
@@ -566,7 +579,7 @@ static const struct inotify_operations inotify_user_ops = {
.destroy_watch = free_inotify_user_watch,
};
-asmlinkage long sys_inotify_init(void)
+asmlinkage long sys_inotify_init1(int flags)
{
struct inotify_device *dev;
struct inotify_handle *ih;
@@ -574,7 +587,14 @@ asmlinkage long sys_inotify_init(void)
struct file *filp;
int fd, ret;
- fd = get_unused_fd();
+ /* Check the IN_* constants for consistency. */
+ BUILD_BUG_ON(IN_CLOEXEC != O_CLOEXEC);
+ BUILD_BUG_ON(IN_NONBLOCK != O_NONBLOCK);
+
+ if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
+ return -EINVAL;
+
+ fd = get_unused_fd_flags(flags & O_CLOEXEC);
if (fd < 0)
return fd;
@@ -610,7 +630,7 @@ asmlinkage long sys_inotify_init(void)
filp->f_path.dentry = dget(inotify_mnt->mnt_root);
filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
filp->f_mode = FMODE_READ;
- filp->f_flags = O_RDONLY;
+ filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
filp->private_data = dev;
INIT_LIST_HEAD(&dev->events);
@@ -638,11 +658,16 @@ out_put_fd:
return ret;
}
-asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
+asmlinkage long sys_inotify_init(void)
+{
+ return sys_inotify_init1(0);
+}
+
+asmlinkage long sys_inotify_add_watch(int fd, const char __user *pathname, u32 mask)
{
struct inode *inode;
struct inotify_device *dev;
- struct nameidata nd;
+ struct path path;
struct file *filp;
int ret, fput_needed;
unsigned flags = 0;
@@ -662,12 +687,12 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
if (mask & IN_ONLYDIR)
flags |= LOOKUP_DIRECTORY;
- ret = find_inode(path, &nd, flags);
+ ret = find_inode(pathname, &path, flags);
if (unlikely(ret))
goto fput_and_out;
- /* inode held in place by reference to nd; dev by fget on fd */
- inode = nd.path.dentry->d_inode;
+ /* inode held in place by reference to path; dev by fget on fd */
+ inode = path.dentry->d_inode;
dev = filp->private_data;
mutex_lock(&dev->up_mutex);
@@ -676,7 +701,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
ret = create_watch(dev, inode, mask);
mutex_unlock(&dev->up_mutex);
- path_put(&nd.path);
+ path_put(&path);
fput_and_out:
fput_light(filp, fput_needed);
return ret;
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7db32b3382d..d152856c371 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -13,9 +13,14 @@
#include <linux/security.h>
#include <linux/module.h>
#include <linux/uaccess.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
#include <asm/ioctls.h>
+/* So that the fiemap access checks can't overflow on 32 bit machines. */
+#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent))
+
/**
* vfs_ioctl - call filesystem specific ioctl methods
* @filp: open file to invoke ioctl method on
@@ -71,6 +76,276 @@ static int ioctl_fibmap(struct file *filp, int __user *p)
return put_user(res, p);
}
+/**
+ * fiemap_fill_next_extent - Fiemap helper function
+ * @fieinfo: Fiemap context passed into ->fiemap
+ * @logical: Extent logical start offset, in bytes
+ * @phys: Extent physical start offset, in bytes
+ * @len: Extent length, in bytes
+ * @flags: FIEMAP_EXTENT flags that describe this extent
+ *
+ * Called from file system ->fiemap callback. Will populate extent
+ * info as passed in via arguments and copy to user memory. On
+ * success, extent count on fieinfo is incremented.
+ *
+ * Returns 0 on success, -errno on error, 1 if this was the last
+ * extent that will fit in user array.
+ */
+#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC)
+#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED)
+#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)
+int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
+ u64 phys, u64 len, u32 flags)
+{
+ struct fiemap_extent extent;
+ struct fiemap_extent *dest = fieinfo->fi_extents_start;
+
+ /* only count the extents */
+ if (fieinfo->fi_extents_max == 0) {
+ fieinfo->fi_extents_mapped++;
+ return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
+ }
+
+ if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
+ return 1;
+
+ if (flags & SET_UNKNOWN_FLAGS)
+ flags |= FIEMAP_EXTENT_UNKNOWN;
+ if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
+ flags |= FIEMAP_EXTENT_ENCODED;
+ if (flags & SET_NOT_ALIGNED_FLAGS)
+ flags |= FIEMAP_EXTENT_NOT_ALIGNED;
+
+ memset(&extent, 0, sizeof(extent));
+ extent.fe_logical = logical;
+ extent.fe_physical = phys;
+ extent.fe_length = len;
+ extent.fe_flags = flags;
+
+ dest += fieinfo->fi_extents_mapped;
+ if (copy_to_user(dest, &extent, sizeof(extent)))
+ return -EFAULT;
+
+ fieinfo->fi_extents_mapped++;
+ if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
+ return 1;
+ return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0;
+}
+EXPORT_SYMBOL(fiemap_fill_next_extent);
+
+/**
+ * fiemap_check_flags - check validity of requested flags for fiemap
+ * @fieinfo: Fiemap context passed into ->fiemap
+ * @fs_flags: Set of fiemap flags that the file system understands
+ *
+ * Called from file system ->fiemap callback. This will compute the
+ * intersection of valid fiemap flags and those that the fs supports. That
+ * value is then compared against the user supplied flags. In case of bad user
+ * flags, the invalid values will be written into the fieinfo structure, and
+ * -EBADR is returned, which tells ioctl_fiemap() to return those values to
+ * userspace. For this reason, a return code of -EBADR should be preserved.
+ *
+ * Returns 0 on success, -EBADR on bad flags.
+ */
+int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags)
+{
+ u32 incompat_flags;
+
+ incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags);
+ if (incompat_flags) {
+ fieinfo->fi_flags = incompat_flags;
+ return -EBADR;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(fiemap_check_flags);
+
+static int fiemap_check_ranges(struct super_block *sb,
+ u64 start, u64 len, u64 *new_len)
+{
+ *new_len = len;
+
+ if (len == 0)
+ return -EINVAL;
+
+ if (start > sb->s_maxbytes)
+ return -EFBIG;
+
+ /*
+ * Shrink request scope to what the fs can actually handle.
+ */
+ if ((len > sb->s_maxbytes) ||
+ (sb->s_maxbytes - len) < start)
+ *new_len = sb->s_maxbytes - start;
+
+ return 0;
+}
+
+static int ioctl_fiemap(struct file *filp, unsigned long arg)
+{
+ struct fiemap fiemap;
+ struct fiemap_extent_info fieinfo = { 0, };
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
+ u64 len;
+ int error;
+
+ if (!inode->i_op->fiemap)
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&fiemap, (struct fiemap __user *)arg,
+ sizeof(struct fiemap)))
+ return -EFAULT;
+
+ if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS)
+ return -EINVAL;
+
+ error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length,
+ &len);
+ if (error)
+ return error;
+
+ fieinfo.fi_flags = fiemap.fm_flags;
+ fieinfo.fi_extents_max = fiemap.fm_extent_count;
+ fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap));
+
+ if (fiemap.fm_extent_count != 0 &&
+ !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start,
+ fieinfo.fi_extents_max * sizeof(struct fiemap_extent)))
+ return -EFAULT;
+
+ if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
+ filemap_write_and_wait(inode->i_mapping);
+
+ error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len);
+ fiemap.fm_flags = fieinfo.fi_flags;
+ fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped;
+ if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap)))
+ error = -EFAULT;
+
+ return error;
+}
+
+#ifdef CONFIG_BLOCK
+
+#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits)
+#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits);
+
+/*
+ * @inode - the inode to map
+ * @arg - the pointer to userspace where we copy everything to
+ * @get_block - the fs's get_block function
+ *
+ * This does FIEMAP for block based inodes. Basically it will just loop
+ * through get_block until we hit the number of extents we want to map, or we
+ * go past the end of the file and hit a hole.
+ *
+ * If it is possible to have data blocks beyond a hole past @inode->i_size, then
+ * please do not use this function, it will stop at the first unmapped block
+ * beyond i_size
+ */
+int generic_block_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo, u64 start,
+ u64 len, get_block_t *get_block)
+{
+ struct buffer_head tmp;
+ unsigned int start_blk;
+ long long length = 0, map_len = 0;
+ u64 logical = 0, phys = 0, size = 0;
+ u32 flags = FIEMAP_EXTENT_MERGED;
+ int ret = 0;
+
+ if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC)))
+ return ret;
+
+ start_blk = logical_to_blk(inode, start);
+
+ /* guard against change */
+ mutex_lock(&inode->i_mutex);
+
+ length = (long long)min_t(u64, len, i_size_read(inode));
+ map_len = length;
+
+ do {
+ /*
+ * we set b_size to the total size we want so it will map as
+ * many contiguous blocks as possible at once
+ */
+ memset(&tmp, 0, sizeof(struct buffer_head));
+ tmp.b_size = map_len;
+
+ ret = get_block(inode, start_blk, &tmp, 0);
+ if (ret)
+ break;
+
+ /* HOLE */
+ if (!buffer_mapped(&tmp)) {
+ /*
+ * first hole after going past the EOF, this is our
+ * last extent
+ */
+ if (length <= 0) {
+ flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST;
+ ret = fiemap_fill_next_extent(fieinfo, logical,
+ phys, size,
+ flags);
+ break;
+ }
+
+ length -= blk_to_logical(inode, 1);
+
+ /* if we have holes up to/past EOF then we're done */
+ if (length <= 0)
+ break;
+
+ start_blk++;
+ } else {
+ if (length <= 0 && size) {
+ ret = fiemap_fill_next_extent(fieinfo, logical,
+ phys, size,
+ flags);
+ if (ret)
+ break;
+ }
+
+ logical = blk_to_logical(inode, start_blk);
+ phys = blk_to_logical(inode, tmp.b_blocknr);
+ size = tmp.b_size;
+ flags = FIEMAP_EXTENT_MERGED;
+
+ length -= tmp.b_size;
+ start_blk += logical_to_blk(inode, size);
+
+ /*
+ * if we are past the EOF we need to loop again to see
+ * if there is a hole so we can mark this extent as the
+ * last one, and if not keep mapping things until we
+ * find a hole, or we run out of slots in the extent
+ * array
+ */
+ if (length <= 0)
+ continue;
+
+ ret = fiemap_fill_next_extent(fieinfo, logical, phys,
+ size, flags);
+ if (ret)
+ break;
+ }
+ cond_resched();
+ } while (1);
+
+ mutex_unlock(&inode->i_mutex);
+
+ /* if ret is 1 then we just hit the end of the extent array */
+ if (ret == 1)
+ ret = 0;
+
+ return ret;
+}
+EXPORT_SYMBOL(generic_block_fiemap);
+
+#endif /* CONFIG_BLOCK */
+
static int file_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
@@ -80,6 +355,8 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
switch (cmd) {
case FIBMAP:
return ioctl_fibmap(filp, p);
+ case FS_IOC_FIEMAP:
+ return ioctl_fiemap(filp, arg);
case FIGETBSZ:
return put_user(inode->i_sb->s_blocksize, p);
case FIONREAD:
diff --git a/fs/ioprio.c b/fs/ioprio.c
index c4a1c3c65aa..da3cc460d4d 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -115,11 +115,11 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
- do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
ret = set_task_ioprio(p, ioprio);
if (ret)
break;
- } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case IOPRIO_WHO_USER:
if (!who)
@@ -204,7 +204,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
- do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
tmpio = get_task_ioprio(p);
if (tmpio < 0)
continue;
@@ -212,7 +212,7 @@ asmlinkage long sys_ioprio_get(int which, int who)
ret = tmpio;
else
ret = ioprio_best(ret, tmpio);
- } while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
break;
case IOPRIO_WHO_USER:
if (!who)
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 044a254d526..3f8af0f1505 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -73,7 +73,7 @@ static void isofs_destroy_inode(struct inode *inode)
kmem_cache_free(isofs_inode_cachep, ISOFS_I(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct iso_inode_info *ei = foo;
@@ -310,7 +310,7 @@ enum {
Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode,
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_norock, "norock"},
{Opt_nojoliet, "nojoliet"},
{Opt_unhide, "unhide"},
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 6bd48f0a704..c2fb2dd0131 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -209,6 +209,11 @@ repeat:
while (rs.len > 2) { /* There may be one byte for padding somewhere */
rr = (struct rock_ridge *)rs.chr;
+ /*
+ * Ignore rock ridge info if rr->len is out of range, but
+ * don't return -EIO because that would make the file
+ * invisible.
+ */
if (rr->len < 3)
goto out; /* Something got screwed up here */
sig = isonum_721(rs.chr);
@@ -216,8 +221,12 @@ repeat:
goto eio;
rs.chr += rr->len;
rs.len -= rr->len;
+ /*
+ * As above, just ignore the rock ridge info if rr->len
+ * is bogus.
+ */
if (rs.len < 0)
- goto eio; /* corrupted isofs */
+ goto out; /* Something got screwed up here */
switch (sig) {
case SIG('R', 'R'):
@@ -307,6 +316,11 @@ parse_rock_ridge_inode_internal(struct iso_directory_record *de,
repeat:
while (rs.len > 2) { /* There may be one byte for padding somewhere */
rr = (struct rock_ridge *)rs.chr;
+ /*
+ * Ignore rock ridge info if rr->len is out of range, but
+ * don't return -EIO because that would make the file
+ * invisible.
+ */
if (rr->len < 3)
goto out; /* Something got screwed up here */
sig = isonum_721(rs.chr);
@@ -314,8 +328,12 @@ repeat:
goto eio;
rs.chr += rr->len;
rs.len -= rr->len;
+ /*
+ * As above, just ignore the rock ridge info if rr->len
+ * is bogus.
+ */
if (rs.len < 0)
- goto eio; /* corrupted isofs */
+ goto out; /* Something got screwed up here */
switch (sig) {
#ifndef CONFIG_ZISOFS /* No flag for SF or ZF */
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 5a8ca61498c..ae08c057e75 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -36,7 +36,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
/*
* When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * not successfully freed, because they are attached to a committing transaction.
* After the transaction commits, these pages are left on the LRU, with no
* ->mapping, and with attached buffers. These pages are trivially reclaimable
* by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -45,8 +45,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
* So here, we have a buffer which has just come off the forget list. Look to
* see if we can strip all buffers from the backing page.
*
- * Called under lock_journal(), and possibly under journal_datalist_lock. The
- * caller provided us with a ref against the buffer, and we drop that here.
+ * Called under journal->j_list_lock. The caller provided us with a ref
+ * against the buffer, and we drop that here.
*/
static void release_buffer_page(struct buffer_head *bh)
{
@@ -63,7 +63,7 @@ static void release_buffer_page(struct buffer_head *bh)
goto nope;
/* OK, it's a truncated page */
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto nope;
page_cache_get(page);
@@ -78,6 +78,19 @@ nope:
}
/*
+ * Decrement reference counter for data buffer. If it has been marked
+ * 'BH_Freed', release it and the page to which it belongs if possible.
+ */
+static void release_data_buffer(struct buffer_head *bh)
+{
+ if (buffer_freed(bh)) {
+ clear_buffer_freed(bh);
+ release_buffer_page(bh);
+ } else
+ put_bh(bh);
+}
+
+/*
* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
* held. For ranking reasons we must trylock. If we lose, schedule away and
* return 0. j_list_lock is dropped in this case.
@@ -172,7 +185,7 @@ static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
/*
* Submit all the data buffers to disk
*/
-static void journal_submit_data_buffers(journal_t *journal,
+static int journal_submit_data_buffers(journal_t *journal,
transaction_t *commit_transaction)
{
struct journal_head *jh;
@@ -180,6 +193,7 @@ static void journal_submit_data_buffers(journal_t *journal,
int locked;
int bufs = 0;
struct buffer_head **wbuf = journal->j_wbuf;
+ int err = 0;
/*
* Whenever we unlock the journal and sleep, things can get added
@@ -207,7 +221,7 @@ write_out_data:
* blocking lock_buffer().
*/
if (buffer_dirty(bh)) {
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
BUFFER_TRACE(bh, "needs blocking lock");
spin_unlock(&journal->j_list_lock);
/* Write out all data to prevent deadlocks */
@@ -231,7 +245,7 @@ write_out_data:
if (locked)
unlock_buffer(bh);
BUFFER_TRACE(bh, "already cleaned up");
- put_bh(bh);
+ release_data_buffer(bh);
continue;
}
if (locked && test_clear_buffer_dirty(bh)) {
@@ -253,15 +267,17 @@ write_out_data:
put_bh(bh);
} else {
BUFFER_TRACE(bh, "writeout complete: unfile");
+ if (unlikely(!buffer_uptodate(bh)))
+ err = -EIO;
__journal_unfile_buffer(jh);
jbd_unlock_bh_state(bh);
if (locked)
unlock_buffer(bh);
journal_remove_journal_head(bh);
- /* Once for our safety reference, once for
+ /* One for our safety reference, other for
* journal_remove_journal_head() */
put_bh(bh);
- put_bh(bh);
+ release_data_buffer(bh);
}
if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
@@ -271,6 +287,8 @@ write_out_data:
}
spin_unlock(&journal->j_list_lock);
journal_do_submit_data(wbuf, bufs);
+
+ return err;
}
/*
@@ -410,8 +428,7 @@ void journal_commit_transaction(journal_t *journal)
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
- err = 0;
- journal_submit_data_buffers(journal, commit_transaction);
+ err = journal_submit_data_buffers(journal, commit_transaction);
/*
* Wait for all previously submitted IO to complete.
@@ -426,10 +443,21 @@ void journal_commit_transaction(journal_t *journal)
if (buffer_locked(bh)) {
spin_unlock(&journal->j_list_lock);
wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- err = -EIO;
spin_lock(&journal->j_list_lock);
}
+ if (unlikely(!buffer_uptodate(bh))) {
+ if (!trylock_page(bh->b_page)) {
+ spin_unlock(&journal->j_list_lock);
+ lock_page(bh->b_page);
+ spin_lock(&journal->j_list_lock);
+ }
+ if (bh->b_page->mapping)
+ set_bit(AS_EIO, &bh->b_page->mapping->flags);
+
+ unlock_page(bh->b_page);
+ SetPageError(bh->b_page);
+ err = -EIO;
+ }
if (!inverted_lock(journal, bh)) {
put_bh(bh);
spin_lock(&journal->j_list_lock);
@@ -443,17 +471,21 @@ void journal_commit_transaction(journal_t *journal)
} else {
jbd_unlock_bh_state(bh);
}
- put_bh(bh);
+ release_data_buffer(bh);
cond_resched_lock(&journal->j_list_lock);
}
spin_unlock(&journal->j_list_lock);
- if (err)
- journal_abort(journal, err);
+ if (err) {
+ char b[BDEVNAME_SIZE];
- journal_write_revoke_records(journal, commit_transaction);
+ printk(KERN_WARNING
+ "JBD: Detected IO errors while flushing file data "
+ "on %s\n", bdevname(journal->j_fs_dev, b));
+ err = 0;
+ }
- jbd_debug(3, "JBD: commit phase 2\n");
+ journal_write_revoke_records(journal, commit_transaction);
/*
* If we found any dirty or locked buffers, then we should have
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index b99c3b3654c..aa7143a8349 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -68,7 +68,6 @@ EXPORT_SYMBOL(journal_set_features);
EXPORT_SYMBOL(journal_create);
EXPORT_SYMBOL(journal_load);
EXPORT_SYMBOL(journal_destroy);
-EXPORT_SYMBOL(journal_update_superblock);
EXPORT_SYMBOL(journal_abort);
EXPORT_SYMBOL(journal_errno);
EXPORT_SYMBOL(journal_ack_err);
@@ -1636,9 +1635,10 @@ static int journal_init_journal_head_cache(void)
static void journal_destroy_journal_head_cache(void)
{
- J_ASSERT(journal_head_cache != NULL);
- kmem_cache_destroy(journal_head_cache);
- journal_head_cache = NULL;
+ if (journal_head_cache) {
+ kmem_cache_destroy(journal_head_cache);
+ journal_head_cache = NULL;
+ }
}
/*
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 1bb43e987f4..c7bd649bbbd 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -166,138 +166,123 @@ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
return NULL;
}
+void journal_destroy_revoke_caches(void)
+{
+ if (revoke_record_cache) {
+ kmem_cache_destroy(revoke_record_cache);
+ revoke_record_cache = NULL;
+ }
+ if (revoke_table_cache) {
+ kmem_cache_destroy(revoke_table_cache);
+ revoke_table_cache = NULL;
+ }
+}
+
int __init journal_init_revoke_caches(void)
{
+ J_ASSERT(!revoke_record_cache);
+ J_ASSERT(!revoke_table_cache);
+
revoke_record_cache = kmem_cache_create("revoke_record",
sizeof(struct jbd_revoke_record_s),
0,
SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
NULL);
if (!revoke_record_cache)
- return -ENOMEM;
+ goto record_cache_failure;
revoke_table_cache = kmem_cache_create("revoke_table",
sizeof(struct jbd_revoke_table_s),
0, SLAB_TEMPORARY, NULL);
- if (!revoke_table_cache) {
- kmem_cache_destroy(revoke_record_cache);
- revoke_record_cache = NULL;
- return -ENOMEM;
- }
+ if (!revoke_table_cache)
+ goto table_cache_failure;
+
return 0;
-}
-void journal_destroy_revoke_caches(void)
-{
- kmem_cache_destroy(revoke_record_cache);
- revoke_record_cache = NULL;
- kmem_cache_destroy(revoke_table_cache);
- revoke_table_cache = NULL;
+table_cache_failure:
+ journal_destroy_revoke_caches();
+record_cache_failure:
+ return -ENOMEM;
}
-/* Initialise the revoke table for a given journal to a given size. */
-
-int journal_init_revoke(journal_t *journal, int hash_size)
+static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size)
{
- int shift, tmp;
+ int shift = 0;
+ int tmp = hash_size;
+ struct jbd_revoke_table_s *table;
- J_ASSERT (journal->j_revoke_table[0] == NULL);
+ table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
+ if (!table)
+ goto out;
- shift = 0;
- tmp = hash_size;
while((tmp >>= 1UL) != 0UL)
shift++;
- journal->j_revoke_table[0] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
- if (!journal->j_revoke_table[0])
- return -ENOMEM;
- journal->j_revoke = journal->j_revoke_table[0];
-
- /* Check that the hash_size is a power of two */
- J_ASSERT(is_power_of_2(hash_size));
-
- journal->j_revoke->hash_size = hash_size;
-
- journal->j_revoke->hash_shift = shift;
-
- journal->j_revoke->hash_table =
+ table->hash_size = hash_size;
+ table->hash_shift = shift;
+ table->hash_table =
kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
- if (!journal->j_revoke->hash_table) {
- kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
- journal->j_revoke = NULL;
- return -ENOMEM;
+ if (!table->hash_table) {
+ kmem_cache_free(revoke_table_cache, table);
+ table = NULL;
+ goto out;
}
for (tmp = 0; tmp < hash_size; tmp++)
- INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+ INIT_LIST_HEAD(&table->hash_table[tmp]);
- journal->j_revoke_table[1] = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
- if (!journal->j_revoke_table[1]) {
- kfree(journal->j_revoke_table[0]->hash_table);
- kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
- return -ENOMEM;
+out:
+ return table;
+}
+
+static void journal_destroy_revoke_table(struct jbd_revoke_table_s *table)
+{
+ int i;
+ struct list_head *hash_list;
+
+ for (i = 0; i < table->hash_size; i++) {
+ hash_list = &table->hash_table[i];
+ J_ASSERT(list_empty(hash_list));
}
- journal->j_revoke = journal->j_revoke_table[1];
+ kfree(table->hash_table);
+ kmem_cache_free(revoke_table_cache, table);
+}
- /* Check that the hash_size is a power of two */
+/* Initialise the revoke table for a given journal to a given size. */
+int journal_init_revoke(journal_t *journal, int hash_size)
+{
+ J_ASSERT(journal->j_revoke_table[0] == NULL);
J_ASSERT(is_power_of_2(hash_size));
- journal->j_revoke->hash_size = hash_size;
+ journal->j_revoke_table[0] = journal_init_revoke_table(hash_size);
+ if (!journal->j_revoke_table[0])
+ goto fail0;
- journal->j_revoke->hash_shift = shift;
+ journal->j_revoke_table[1] = journal_init_revoke_table(hash_size);
+ if (!journal->j_revoke_table[1])
+ goto fail1;
- journal->j_revoke->hash_table =
- kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
- if (!journal->j_revoke->hash_table) {
- kfree(journal->j_revoke_table[0]->hash_table);
- kmem_cache_free(revoke_table_cache, journal->j_revoke_table[0]);
- kmem_cache_free(revoke_table_cache, journal->j_revoke_table[1]);
- journal->j_revoke = NULL;
- return -ENOMEM;
- }
-
- for (tmp = 0; tmp < hash_size; tmp++)
- INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+ journal->j_revoke = journal->j_revoke_table[1];
spin_lock_init(&journal->j_revoke_lock);
return 0;
-}
-/* Destoy a journal's revoke table. The table must already be empty! */
+fail1:
+ journal_destroy_revoke_table(journal->j_revoke_table[0]);
+fail0:
+ return -ENOMEM;
+}
+/* Destroy a journal's revoke table. The table must already be empty! */
void journal_destroy_revoke(journal_t *journal)
{
- struct jbd_revoke_table_s *table;
- struct list_head *hash_list;
- int i;
-
- table = journal->j_revoke_table[0];
- if (!table)
- return;
-
- for (i=0; i<table->hash_size; i++) {
- hash_list = &table->hash_table[i];
- J_ASSERT (list_empty(hash_list));
- }
-
- kfree(table->hash_table);
- kmem_cache_free(revoke_table_cache, table);
- journal->j_revoke = NULL;
-
- table = journal->j_revoke_table[1];
- if (!table)
- return;
-
- for (i=0; i<table->hash_size; i++) {
- hash_list = &table->hash_table[i];
- J_ASSERT (list_empty(hash_list));
- }
-
- kfree(table->hash_table);
- kmem_cache_free(revoke_table_cache, table);
journal->j_revoke = NULL;
+ if (journal->j_revoke_table[0])
+ journal_destroy_revoke_table(journal->j_revoke_table[0]);
+ if (journal->j_revoke_table[1])
+ journal_destroy_revoke_table(journal->j_revoke_table[1]);
}
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 67ff2024c23..0540ca27a44 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -291,7 +291,7 @@ handle_t *journal_start(journal_t *journal, int nblocks)
goto out;
}
- lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+ lock_map_acquire(&handle->h_lockdep_map);
out:
return handle;
@@ -1448,7 +1448,7 @@ int journal_stop(handle_t *handle)
spin_unlock(&journal->j_state_lock);
}
- lock_release(&handle->h_lockdep_map, 1, _THIS_IP_);
+ lock_map_release(&handle->h_lockdep_map);
jbd_free_handle(handle);
return err;
@@ -1648,12 +1648,42 @@ out:
return;
}
+/*
+ * journal_try_to_free_buffers() could race with journal_commit_transaction()
+ * The latter might still hold the a count on buffers when inspecting
+ * them on t_syncdata_list or t_locked_list.
+ *
+ * journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * tryinf to free that buffer.
+ *
+ * Called with journal->j_state_lock held.
+ */
+static void journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction = NULL;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction) {
+ spin_unlock(&journal->j_state_lock);
+ return;
+ }
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ log_wait_commit(journal, tid);
+}
/**
* int journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
*
*
* For all the buffers on this page,
@@ -1682,9 +1712,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1713,7 +1745,28 @@ int journal_try_to_free_buffers(journal_t *journal,
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where journal_try_to_free_buffers()
+ * could race with journal_commit_transaction(), the later still
+ * holds the reference to the buffers to free while processing them.
+ * try_to_free_buffers() failed to free those buffers. Some of the
+ * caller of releasepage() request page buffers to be dropped, otherwise
+ * treat the fail-to-free as errors (such as generic_file_direct_IO())
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour(i.e make sure buffers are dropped upon return),
+ * let's wait for the current transaction to finish flush of
+ * dirty data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+ journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ }
+
busy:
return ret;
}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6914598022c..9203c3332f1 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -20,6 +20,7 @@
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
+#include <linux/marker.h>
#include <linux/errno.h>
#include <linux/slab.h>
@@ -93,7 +94,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
int ret = 0;
struct buffer_head *bh = jh2bh(jh);
- if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
+ if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
+ !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
ret = __jbd2_journal_remove_checkpoint(jh) + 1;
jbd_unlock_bh_state(bh);
@@ -126,14 +128,29 @@ void __jbd2_log_wait_for_space(journal_t *journal)
/*
* Test again, another process may have checkpointed while we
- * were waiting for the checkpoint lock
+ * were waiting for the checkpoint lock. If there are no
+ * outstanding transactions there is nothing to checkpoint and
+ * we can't make progress. Abort the journal in this case.
*/
spin_lock(&journal->j_state_lock);
+ spin_lock(&journal->j_list_lock);
nblocks = jbd_space_needed(journal);
if (__jbd2_log_space_left(journal) < nblocks) {
+ int chkpt = journal->j_checkpoint_transactions != NULL;
+
+ spin_unlock(&journal->j_list_lock);
spin_unlock(&journal->j_state_lock);
- jbd2_log_do_checkpoint(journal);
+ if (chkpt) {
+ jbd2_log_do_checkpoint(journal);
+ } else {
+ printk(KERN_ERR "%s: no transactions\n",
+ __func__);
+ jbd2_journal_abort(journal, 0);
+ }
+
spin_lock(&journal->j_state_lock);
+ } else {
+ spin_unlock(&journal->j_list_lock);
}
mutex_unlock(&journal->j_checkpoint_mutex);
}
@@ -160,21 +177,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
* buffers. Note that we take the buffers in the opposite ordering
* from the one in which they were submitted for IO.
*
+ * Return 0 on success, and return <0 if some buffers have failed
+ * to be written out.
+ *
* Called with j_list_lock held.
*/
-static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
+static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
{
struct journal_head *jh;
struct buffer_head *bh;
tid_t this_tid;
int released = 0;
+ int ret = 0;
this_tid = transaction->t_tid;
restart:
/* Did somebody clean up the transaction in the meanwhile? */
if (journal->j_checkpoint_transactions != transaction ||
transaction->t_tid != this_tid)
- return;
+ return ret;
while (!released && transaction->t_checkpoint_io_list) {
jh = transaction->t_checkpoint_io_list;
bh = jh2bh(jh);
@@ -194,6 +215,9 @@ restart:
spin_lock(&journal->j_list_lock);
goto restart;
}
+ if (unlikely(buffer_write_io_error(bh)))
+ ret = -EIO;
+
/*
* Now in whatever state the buffer currently is, we know that
* it has been written out and so we can drop it from the list
@@ -203,6 +227,8 @@ restart:
jbd2_journal_remove_journal_head(bh);
__brelse(bh);
}
+
+ return ret;
}
#define NR_BATCH 64
@@ -226,7 +252,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
* Try to flush one buffer from the checkpoint list to disk.
*
* Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list.
+ * scan of the checkpoint list. Return <0 if the buffer has failed to
+ * be written out.
*
* Called with j_list_lock held and drops it if 1 is returned
* Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -258,6 +285,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
jbd2_log_wait_commit(journal, tid);
ret = 1;
} else if (!buffer_dirty(bh)) {
+ ret = 1;
+ if (unlikely(buffer_write_io_error(bh)))
+ ret = -EIO;
J_ASSERT_JH(jh, !buffer_jbddirty(bh));
BUFFER_TRACE(bh, "remove from checkpoint");
__jbd2_journal_remove_checkpoint(jh);
@@ -265,7 +295,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
jbd_unlock_bh_state(bh);
jbd2_journal_remove_journal_head(bh);
__brelse(bh);
- ret = 1;
} else {
/*
* Important: we are about to write the buffer, and
@@ -298,6 +327,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
* to disk. We submit larger chunks of data at once.
*
* The journal should be locked before calling this function.
+ * Called with j_checkpoint_mutex held.
*/
int jbd2_log_do_checkpoint(journal_t *journal)
{
@@ -313,6 +343,8 @@ int jbd2_log_do_checkpoint(journal_t *journal)
* journal straight away.
*/
result = jbd2_cleanup_journal_tail(journal);
+ trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d",
+ journal->j_devname, result);
jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
if (result <= 0)
return result;
@@ -321,6 +353,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
* OK, we need to start writing disk blocks. Take one transaction
* and write it.
*/
+ result = 0;
spin_lock(&journal->j_list_lock);
if (!journal->j_checkpoint_transactions)
goto out;
@@ -339,7 +372,7 @@ restart:
int batch_count = 0;
struct buffer_head *bhs[NR_BATCH];
struct journal_head *jh;
- int retry = 0;
+ int retry = 0, err;
while (!retry && transaction->t_checkpoint_list) {
struct buffer_head *bh;
@@ -353,6 +386,8 @@ restart:
}
retry = __process_buffer(journal, jh, bhs, &batch_count,
transaction);
+ if (retry < 0 && !result)
+ result = retry;
if (!retry && (need_resched() ||
spin_needbreak(&journal->j_list_lock))) {
spin_unlock(&journal->j_list_lock);
@@ -377,14 +412,18 @@ restart:
* Now we have cleaned up the first transaction's checkpoint
* list. Let's clean up the second one
*/
- __wait_cp_io(journal, transaction);
+ err = __wait_cp_io(journal, transaction);
+ if (!result)
+ result = err;
}
out:
spin_unlock(&journal->j_list_lock);
- result = jbd2_cleanup_journal_tail(journal);
if (result < 0)
- return result;
- return 0;
+ jbd2_journal_abort(journal, result);
+ else
+ result = jbd2_cleanup_journal_tail(journal);
+
+ return (result < 0) ? result : 0;
}
/*
@@ -400,8 +439,9 @@ out:
* This is the only part of the journaling code which really needs to be
* aware of transaction aborts. Checkpointing involves writing to the
* main filesystem area rather than to the journal, so it can proceed
- * even in abort state, but we must not update the journal superblock if
- * we have an abort error outstanding.
+ * even in abort state, but we must not update the super block if
+ * checkpointing may have failed. Otherwise, we would lose some metadata
+ * buffers which should be written-back to the filesystem.
*/
int jbd2_cleanup_journal_tail(journal_t *journal)
@@ -410,6 +450,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
tid_t first_tid;
unsigned long blocknr, freed;
+ if (is_journal_aborted(journal))
+ return 1;
+
/* OK, work out the oldest transaction remaining in the log, and
* the log block it starts at.
*
@@ -688,7 +731,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
- J_ASSERT(transaction->t_sync_datalist == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index a2ed72f7cee..0abe02c4242 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -16,12 +16,15 @@
#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
+#include <linux/marker.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +40,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
}
/*
- * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * When an ext4 file is truncated, it is possible that some pages are not
+ * successfully freed, because they are attached to a committing transaction.
* After the transaction commits, these pages are left on the LRU, with no
* ->mapping, and with attached buffers. These pages are trivially reclaimable
* by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -65,7 +68,7 @@ static void release_buffer_page(struct buffer_head *bh)
goto nope;
/* OK, it's a truncated page */
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto nope;
page_cache_get(page);
@@ -80,21 +83,6 @@ nope:
}
/*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held. For ranking reasons we must trylock. If we lose, schedule away and
- * return 0. j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
- if (!jbd_trylock_bh_state(bh)) {
- spin_unlock(&journal->j_list_lock);
- schedule();
- return 0;
- }
- return 1;
-}
-
-/*
* Done it all: now submit the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
* mode we can now just skip the rest of the journal write
@@ -112,6 +100,7 @@ static int journal_submit_commit_record(journal_t *journal,
struct buffer_head *bh;
int ret;
int barrier_done = 0;
+ struct timespec now = current_kernel_time();
if (is_journal_aborted(journal))
return 0;
@@ -126,6 +115,8 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+ tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+ tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
if (JBD2_HAS_COMPAT_FEATURE(journal,
JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -136,8 +127,7 @@ static int journal_submit_commit_record(journal_t *journal,
JBUFFER_TRACE(descriptor, "submit commit block");
lock_buffer(bh);
- get_bh(bh);
- set_buffer_dirty(bh);
+ clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
@@ -157,12 +147,9 @@ static int journal_submit_commit_record(journal_t *journal,
* to remember if we sent a barrier request
*/
if (ret == -EOPNOTSUPP && barrier_done) {
- char b[BDEVNAME_SIZE];
-
printk(KERN_WARNING
- "JBD: barrier-based sync failed on %s - "
- "disabling barriers\n",
- bdevname(journal->j_dev, b));
+ "JBD: barrier-based sync failed on %s - "
+ "disabling barriers\n", journal->j_devname);
spin_lock(&journal->j_state_lock);
journal->j_flags &= ~JBD2_BARRIER;
spin_unlock(&journal->j_state_lock);
@@ -170,7 +157,7 @@ static int journal_submit_commit_record(journal_t *journal,
/* And try again, without the barrier */
lock_buffer(bh);
set_buffer_uptodate(bh);
- set_buffer_dirty(bh);
+ clear_buffer_dirty(bh);
ret = submit_bh(WRITE, bh);
}
*cbh = bh;
@@ -197,159 +184,114 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
}
/*
- * Wait for all submitted IO to complete.
+ * write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with dealyed allocation we may be doing
+ * block allocation in writepages().
*/
-static int journal_wait_on_locked_list(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
- int ret = 0;
- struct journal_head *jh;
-
- while (commit_transaction->t_locked_list) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_locked_list->b_tprev;
- bh = jh2bh(jh);
- get_bh(bh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- ret = -EIO;
- spin_lock(&journal->j_list_lock);
- }
- if (!inverted_lock(journal, bh)) {
- put_bh(bh);
- spin_lock(&journal->j_list_lock);
- continue;
- }
- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
- __jbd2_journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- jbd2_journal_remove_journal_head(bh);
- put_bh(bh);
- } else {
- jbd_unlock_bh_state(bh);
- }
- put_bh(bh);
- cond_resched_lock(&journal->j_list_lock);
- }
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = mapping->nrpages * 2,
+ .range_start = 0,
+ .range_end = i_size_read(mapping->host),
+ .for_writepages = 1,
+ };
+
+ ret = generic_writepages(mapping, &wbc);
return ret;
- }
+}
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit all the data buffers of inode associated with the transaction to
+ * disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
+ * operate on from being released while we write out pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- int i;
+ struct jbd2_inode *jinode;
+ int err, ret = 0;
+ struct address_space *mapping;
- for (i = 0; i < bufs; i++) {
- wbuf[i]->b_end_io = end_buffer_write_sync;
- /* We use-up our safety reference in submit_bh() */
- submit_bh(WRITE, wbuf[i]);
+ spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ mapping = jinode->i_vfs_inode->i_mapping;
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ /*
+ * submit the inode data buffers. We use writepage
+ * instead of writepages. Because writepages can do
+ * block allocation with delalloc. We need to write
+ * only allocated blocks here.
+ */
+ err = journal_submit_inode_data_buffers(mapping);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ J_ASSERT(jinode->i_transaction == commit_transaction);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
+ spin_unlock(&journal->j_list_lock);
+ return ret;
}
/*
- * Submit all the data buffers to disk
+ * Wait for data submitted for writeout, refile inodes to proper
+ * transaction if needed.
+ *
*/
-static void journal_submit_data_buffers(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_finish_inode_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- struct journal_head *jh;
- struct buffer_head *bh;
- int locked;
- int bufs = 0;
- struct buffer_head **wbuf = journal->j_wbuf;
+ struct jbd2_inode *jinode, *next_i;
+ int err, ret = 0;
- /*
- * Whenever we unlock the journal and sleep, things can get added
- * onto ->t_sync_datalist, so we have to keep looping back to
- * write_out_data until we *know* that the list is empty.
- *
- * Cleanup any flushed data buffers from the data list. Even in
- * abort mode, we want to flush this out as soon as possible.
- */
-write_out_data:
- cond_resched();
+ /* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
-
- while (commit_transaction->t_sync_datalist) {
- jh = commit_transaction->t_sync_datalist;
- bh = jh2bh(jh);
- locked = 0;
-
- /* Get reference just to make sure buffer does not disappear
- * when we are forced to drop various locks */
- get_bh(bh);
- /* If the buffer is dirty, we need to submit IO and hence
- * we need the buffer lock. We try to lock the buffer without
- * blocking. If we fail, we need to drop j_list_lock and do
- * blocking lock_buffer().
- */
- if (buffer_dirty(bh)) {
- if (test_set_buffer_locked(bh)) {
- BUFFER_TRACE(bh, "needs blocking lock");
- spin_unlock(&journal->j_list_lock);
- /* Write out all data to prevent deadlocks */
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- lock_buffer(bh);
- spin_lock(&journal->j_list_lock);
- }
- locked = 1;
- }
- /* We have to get bh_state lock. Again out of order, sigh. */
- if (!inverted_lock(journal, bh)) {
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- }
- /* Someone already cleaned up the buffer? */
- if (!buffer_jbd(bh)
- || jh->b_transaction != commit_transaction
- || jh->b_jlist != BJ_SyncData) {
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "already cleaned up");
- put_bh(bh);
- continue;
- }
- if (locked && test_clear_buffer_dirty(bh)) {
- BUFFER_TRACE(bh, "needs writeout, adding to array");
- wbuf[bufs++] = bh;
- __jbd2_journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- if (bufs == journal->j_wbufsize) {
- spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- goto write_out_data;
- }
- } else if (!locked && buffer_locked(bh)) {
- __jbd2_journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- put_bh(bh);
- } else {
- BUFFER_TRACE(bh, "writeout complete: unfile");
- __jbd2_journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- jbd2_journal_remove_journal_head(bh);
- /* Once for our safety reference, once for
- * jbd2_journal_remove_journal_head() */
- put_bh(bh);
- put_bh(bh);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+ if (err) {
+ /*
+ * Because AS_EIO is cleared by
+ * wait_on_page_writeback_range(), set it again so
+ * that user process can get -EIO from fsync().
+ */
+ set_bit(AS_EIO,
+ &jinode->i_vfs_inode->i_mapping->flags);
+
+ if (!ret)
+ ret = err;
}
+ spin_lock(&journal->j_list_lock);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ }
- if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
- spin_unlock(&journal->j_list_lock);
- goto write_out_data;
+ /* Now refile inode to proper lists */
+ list_for_each_entry_safe(jinode, next_i,
+ &commit_transaction->t_inode_list, i_list) {
+ list_del(&jinode->i_list);
+ if (jinode->i_next_transaction) {
+ jinode->i_transaction = jinode->i_next_transaction;
+ jinode->i_next_transaction = NULL;
+ list_add(&jinode->i_list,
+ &jinode->i_transaction->t_inode_list);
+ } else {
+ jinode->i_transaction = NULL;
}
}
spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
+
+ return ret;
}
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -426,6 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
commit_transaction = journal->j_running_transaction;
J_ASSERT(commit_transaction->t_state == T_RUNNING);
+ trace_mark(jbd2_start_commit, "dev %s transaction %d",
+ journal->j_devname, commit_transaction->t_tid);
jbd_debug(1, "JBD: starting commit of transaction %d\n",
commit_transaction->t_tid);
@@ -524,21 +468,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
- err = 0;
- journal_submit_data_buffers(journal, commit_transaction);
-
- /*
- * Wait for all previously submitted IO to complete if commit
- * record is to be written synchronously.
- */
- spin_lock(&journal->j_list_lock);
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
- err = journal_wait_on_locked_list(journal,
- commit_transaction);
-
- spin_unlock(&journal->j_list_lock);
-
+ err = journal_submit_data_buffers(journal, commit_transaction);
if (err)
jbd2_journal_abort(journal, err);
@@ -547,16 +477,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd_debug(3, "JBD: commit phase 2\n");
/*
- * If we found any dirty or locked buffers, then we should have
- * looped back up to the write_out_data label. If there weren't
- * any then journal_clean_data_list should have wiped the list
- * clean by now, so check that it is in fact empty.
- */
- J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-
- jbd_debug (3, "JBD: commit phase 3\n");
-
- /*
* Way to go: we have now written out all of the data for a
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
@@ -574,6 +494,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(commit_transaction->t_nr_buffers <=
commit_transaction->t_outstanding_credits);
+ err = 0;
descriptor = NULL;
bufs = 0;
while (commit_transaction->t_buffers) {
@@ -583,9 +504,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jh = commit_transaction->t_buffers;
/* If we're in abort mode, we just un-journal the buffer and
- release it for background writing. */
+ release it. */
if (is_journal_aborted(journal)) {
+ clear_buffer_jbddirty(jh2bh(jh));
JBUFFER_TRACE(jh, "journal is aborting: refile");
jbd2_journal_refile_buffer(journal, jh);
/* If that was the last one, we need to clean up
@@ -748,13 +670,23 @@ start_journal_io:
&cbh, crc32_sum);
if (err)
__jbd2_journal_abort_hard(journal);
+ }
- spin_lock(&journal->j_list_lock);
- err = journal_wait_on_locked_list(journal,
- commit_transaction);
- spin_unlock(&journal->j_list_lock);
- if (err)
- __jbd2_journal_abort_hard(journal);
+ /*
+ * This is the right place to wait for data buffers both for ASYNC
+ * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+ * the commit block went to disk (which happens above). If commit is
+ * SYNC, we need to wait for data buffers before we start writing
+ * commit block, which happens below in such setting.
+ */
+ err = journal_finish_inode_data_buffers(journal, commit_transaction);
+ if (err) {
+ printk(KERN_WARNING
+ "JBD2: Detected IO errors while flushing file data "
+ "on %s\n", journal->j_devname);
+ if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
+ jbd2_journal_abort(journal, err);
+ err = 0;
}
/* Lo and behold: we have just managed to send a transaction to
@@ -768,7 +700,7 @@ start_journal_io:
so we incur less scheduling load.
*/
- jbd_debug(3, "JBD: commit phase 4\n");
+ jbd_debug(3, "JBD: commit phase 3\n");
/*
* akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -827,7 +759,7 @@ wait_for_iobuf:
J_ASSERT (commit_transaction->t_shadow_list == NULL);
- jbd_debug(3, "JBD: commit phase 5\n");
+ jbd_debug(3, "JBD: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
@@ -854,7 +786,10 @@ wait_for_iobuf:
/* AKPM: bforget here */
}
- jbd_debug(3, "JBD: commit phase 6\n");
+ if (err)
+ jbd2_journal_abort(journal, err);
+
+ jbd_debug(3, "JBD: commit phase 5\n");
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -874,9 +809,9 @@ wait_for_iobuf:
transaction can be removed from any checkpoint list it was on
before. */
- jbd_debug(3, "JBD: commit phase 7\n");
+ jbd_debug(3, "JBD: commit phase 6\n");
- J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+ J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -952,6 +887,8 @@ restart_loop:
if (buffer_jbddirty(bh)) {
JBUFFER_TRACE(jh, "add to new checkpointing trans");
__jbd2_journal_insert_checkpoint(jh, commit_transaction);
+ if (is_journal_aborted(journal))
+ clear_buffer_jbddirty(bh);
JBUFFER_TRACE(jh, "refile for checkpoint writeback");
__jbd2_journal_refile_buffer(jh);
jbd_unlock_bh_state(bh);
@@ -997,7 +934,7 @@ restart_loop:
/* Done with this transaction! */
- jbd_debug(3, "JBD: commit phase 8\n");
+ jbd_debug(3, "JBD: commit phase 7\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT);
@@ -1058,6 +995,9 @@ restart_loop:
}
spin_unlock(&journal->j_list_lock);
+ trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
+ journal->j_devname, commit_transaction->t_tid,
+ journal->j_tail_sequence);
jbd_debug(1, "JBD: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 2e24567c4a7..783de118de9 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
EXPORT_SYMBOL(jbd2_journal_get_write_access);
EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
-EXPORT_SYMBOL(jbd2_journal_dirty_data);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_release_buffer);
EXPORT_SYMBOL(jbd2_journal_forget);
@@ -69,7 +68,6 @@ EXPORT_SYMBOL(jbd2_journal_set_features);
EXPORT_SYMBOL(jbd2_journal_create);
EXPORT_SYMBOL(jbd2_journal_load);
EXPORT_SYMBOL(jbd2_journal_destroy);
-EXPORT_SYMBOL(jbd2_journal_update_superblock);
EXPORT_SYMBOL(jbd2_journal_abort);
EXPORT_SYMBOL(jbd2_journal_errno);
EXPORT_SYMBOL(jbd2_journal_ack_err);
@@ -82,6 +80,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
EXPORT_SYMBOL(jbd2_journal_invalidatepage);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
+EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
@@ -595,13 +597,9 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
if (ret)
*retp = ret;
else {
- char b[BDEVNAME_SIZE];
-
printk(KERN_ALERT "%s: journal block not found "
"at offset %lu on %s\n",
- __func__,
- blocknr,
- bdevname(journal->j_dev, b));
+ __func__, blocknr, journal->j_devname);
err = -EIO;
__journal_abort_soft(journal, err);
}
@@ -899,10 +897,7 @@ static struct proc_dir_entry *proc_jbd2_stats;
static void jbd2_stats_proc_init(journal_t *journal)
{
- char name[BDEVNAME_SIZE];
-
- bdevname(journal->j_dev, name);
- journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
+ journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats);
if (journal->j_proc_entry) {
proc_create_data("history", S_IRUGO, journal->j_proc_entry,
&jbd2_seq_history_fops, journal);
@@ -913,12 +908,9 @@ static void jbd2_stats_proc_init(journal_t *journal)
static void jbd2_stats_proc_exit(journal_t *journal)
{
- char name[BDEVNAME_SIZE];
-
- bdevname(journal->j_dev, name);
remove_proc_entry("info", journal->j_proc_entry);
remove_proc_entry("history", journal->j_proc_entry);
- remove_proc_entry(name, proc_jbd2_stats);
+ remove_proc_entry(journal->j_devname, proc_jbd2_stats);
}
static void journal_init_stats(journal_t *journal)
@@ -1016,6 +1008,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
{
journal_t *journal = journal_init_common();
struct buffer_head *bh;
+ char *p;
int n;
if (!journal)
@@ -1037,6 +1030,10 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
journal->j_fs_dev = fs_dev;
journal->j_blk_offset = start;
journal->j_maxlen = len;
+ bdevname(journal->j_dev, journal->j_devname);
+ p = journal->j_devname;
+ while ((p = strchr(p, '/')))
+ *p = '!';
jbd2_stats_proc_init(journal);
bh = __getblk(journal->j_dev, start, journal->j_blocksize);
@@ -1059,6 +1056,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
{
struct buffer_head *bh;
journal_t *journal = journal_init_common();
+ char *p;
int err;
int n;
unsigned long long blocknr;
@@ -1068,6 +1066,12 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev;
journal->j_inode = inode;
+ bdevname(journal->j_dev, journal->j_devname);
+ p = journal->j_devname;
+ while ((p = strchr(p, '/')))
+ *p = '!';
+ p = journal->j_devname + strlen(journal->j_devname);
+ sprintf(p, ":%lu", journal->j_inode->i_ino);
jbd_debug(1,
"journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
journal, inode->i_sb->s_id, inode->i_ino,
@@ -1251,6 +1255,22 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
goto out;
}
+ if (buffer_write_io_error(bh)) {
+ /*
+ * Oh, dear. A previous attempt to write the journal
+ * superblock failed. This could happen because the
+ * USB device was yanked out. Or it could happen to
+ * be a transient write error and maybe the block will
+ * be remapped. Nothing we can do but to retry the
+ * write and hope for the best.
+ */
+ printk(KERN_ERR "JBD2: previous I/O error detected "
+ "for journal superblock update for %s.\n",
+ journal->j_devname);
+ clear_buffer_write_io_error(bh);
+ set_buffer_uptodate(bh);
+ }
+
spin_lock(&journal->j_state_lock);
jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
journal->j_tail, journal->j_tail_sequence, journal->j_errno);
@@ -1262,9 +1282,16 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
BUFFER_TRACE(bh, "marking dirty");
mark_buffer_dirty(bh);
- if (wait)
+ if (wait) {
sync_dirty_buffer(bh);
- else
+ if (buffer_write_io_error(bh)) {
+ printk(KERN_ERR "JBD2: I/O error detected "
+ "when updating journal superblock for %s.\n",
+ journal->j_devname);
+ clear_buffer_write_io_error(bh);
+ set_buffer_uptodate(bh);
+ }
+ } else
ll_rw_block(SWRITE, 1, &bh);
out:
@@ -1424,9 +1451,12 @@ recovery_error:
*
* Release a journal_t structure once it is no longer in use by the
* journaled object.
+ * Return <0 if we couldn't clean up the journal.
*/
-void jbd2_journal_destroy(journal_t *journal)
+int jbd2_journal_destroy(journal_t *journal)
{
+ int err = 0;
+
/* Wait for the commit thread to wake up and die. */
journal_kill_thread(journal);
@@ -1449,11 +1479,16 @@ void jbd2_journal_destroy(journal_t *journal)
J_ASSERT(journal->j_checkpoint_transactions == NULL);
spin_unlock(&journal->j_list_lock);
- /* We can now mark the journal as empty. */
- journal->j_tail = 0;
- journal->j_tail_sequence = ++journal->j_transaction_sequence;
if (journal->j_sb_buffer) {
- jbd2_journal_update_superblock(journal, 1);
+ if (!is_journal_aborted(journal)) {
+ /* We can now mark the journal as empty. */
+ journal->j_tail = 0;
+ journal->j_tail_sequence =
+ ++journal->j_transaction_sequence;
+ jbd2_journal_update_superblock(journal, 1);
+ } else {
+ err = -EIO;
+ }
brelse(journal->j_sb_buffer);
}
@@ -1465,6 +1500,8 @@ void jbd2_journal_destroy(journal_t *journal)
jbd2_journal_destroy_revoke(journal);
kfree(journal->j_wbuf);
kfree(journal);
+
+ return err;
}
@@ -1690,10 +1727,16 @@ int jbd2_journal_flush(journal_t *journal)
spin_lock(&journal->j_list_lock);
while (!err && journal->j_checkpoint_transactions != NULL) {
spin_unlock(&journal->j_list_lock);
+ mutex_lock(&journal->j_checkpoint_mutex);
err = jbd2_log_do_checkpoint(journal);
+ mutex_unlock(&journal->j_checkpoint_mutex);
spin_lock(&journal->j_list_lock);
}
spin_unlock(&journal->j_list_lock);
+
+ if (is_journal_aborted(journal))
+ return -EIO;
+
jbd2_cleanup_journal_tail(journal);
/* Finally, mark the journal as really needing no recovery.
@@ -1715,7 +1758,7 @@ int jbd2_journal_flush(journal_t *journal)
J_ASSERT(journal->j_head == journal->j_tail);
J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
spin_unlock(&journal->j_state_lock);
- return err;
+ return 0;
}
/**
@@ -1759,23 +1802,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
}
/*
- * journal_dev_name: format a character string to describe on what
- * device this journal is present.
- */
-
-static const char *journal_dev_name(journal_t *journal, char *buffer)
-{
- struct block_device *bdev;
-
- if (journal->j_inode)
- bdev = journal->j_inode->i_sb->s_bdev;
- else
- bdev = journal->j_dev;
-
- return bdevname(bdev, buffer);
-}
-
-/*
* Journal abort has very specific semantics, which we describe
* for journal abort.
*
@@ -1791,13 +1817,12 @@ static const char *journal_dev_name(journal_t *journal, char *buffer)
void __jbd2_journal_abort_hard(journal_t *journal)
{
transaction_t *transaction;
- char b[BDEVNAME_SIZE];
if (journal->j_flags & JBD2_ABORT)
return;
printk(KERN_ERR "Aborting journal on device %s.\n",
- journal_dev_name(journal, b));
+ journal->j_devname);
spin_lock(&journal->j_state_lock);
journal->j_flags |= JBD2_ABORT;
@@ -2195,6 +2220,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
}
/*
+ * Initialize jbd inode head
+ */
+void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
+{
+ jinode->i_transaction = NULL;
+ jinode->i_next_transaction = NULL;
+ jinode->i_vfs_inode = inode;
+ jinode->i_flags = 0;
+ INIT_LIST_HEAD(&jinode->i_list);
+}
+
+/*
+ * Function to be called before we start removing inode from memory (i.e.,
+ * clear_inode() is a fine place to be called from). It removes inode from
+ * transaction's lists.
+ */
+void jbd2_journal_release_jbd_inode(journal_t *journal,
+ struct jbd2_inode *jinode)
+{
+ int writeout = 0;
+
+ if (!journal)
+ return;
+restart:
+ spin_lock(&journal->j_list_lock);
+ /* Is commit writing out inode - we have to wait */
+ if (jinode->i_flags & JI_COMMIT_RUNNING) {
+ wait_queue_head_t *wq;
+ DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+ wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&journal->j_list_lock);
+ schedule();
+ finish_wait(wq, &wait.wait);
+ goto restart;
+ }
+
+ /* Do we need to wait for data writeback? */
+ if (journal->j_committing_transaction == jinode->i_transaction)
+ writeout = 1;
+ if (jinode->i_transaction) {
+ list_del(&jinode->i_list);
+ jinode->i_transaction = NULL;
+ }
+ spin_unlock(&journal->j_list_lock);
+}
+
+/*
* debugfs tunables
*/
#ifdef CONFIG_JBD2_DEBUG
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 058f50f65b7..73063285b13 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -225,7 +225,7 @@ do { \
*/
int jbd2_journal_recover(journal_t *journal)
{
- int err;
+ int err, err2;
journal_superblock_t * sb;
struct recovery_info info;
@@ -263,7 +263,10 @@ int jbd2_journal_recover(journal_t *journal)
journal->j_transaction_sequence = ++info.end_transaction;
jbd2_journal_clear_revoke(journal);
- sync_blockdev(journal->j_fs_dev);
+ err2 = sync_blockdev(journal->j_fs_dev);
+ if (!err)
+ err = err2;
+
return err;
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d6e006e6780..e5d540588fa 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
* new transaction and we can't block without protecting against other
* processes trying to touch the journal while it is in transition.
*
- * Called under j_state_lock
*/
static transaction_t *
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
+ INIT_LIST_HEAD(&transaction->t_inode_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -301,7 +301,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks)
goto out;
}
- lock_acquire(&handle->h_lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+ lock_map_acquire(&handle->h_lockdep_map);
out:
return handle;
}
@@ -943,183 +943,6 @@ out:
}
/**
- * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
- * needs to be flushed before we can commit the
- * current transaction.
- * @handle: transaction
- * @bh: bufferhead to mark
- *
- * The buffer is placed on the transaction's data list and is marked as
- * belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
- * by kswapd.
- */
-int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- journal_t *journal = handle->h_transaction->t_journal;
- int need_brelse = 0;
- struct journal_head *jh;
-
- if (is_handle_aborted(handle))
- return 0;
-
- jh = jbd2_journal_add_journal_head(bh);
- JBUFFER_TRACE(jh, "entry");
-
- /*
- * The buffer could *already* be dirty. Writeout can start
- * at any time.
- */
- jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-
- /*
- * What if the buffer is already part of a running transaction?
- *
- * There are two cases:
- * 1) It is part of the current running transaction. Refile it,
- * just in case we have allocated it as metadata, deallocated
- * it, then reallocated it as data.
- * 2) It is part of the previous, still-committing transaction.
- * If all we want to do is to guarantee that the buffer will be
- * written to disk before this new transaction commits, then
- * being sure that the *previous* transaction has this same
- * property is sufficient for us! Just leave it on its old
- * transaction.
- *
- * In case (2), the buffer must not already exist as metadata
- * --- that would violate write ordering (a transaction is free
- * to write its data at any point, even before the previous
- * committing transaction has committed). The caller must
- * never, ever allow this to happen: there's nothing we can do
- * about it in this layer.
- */
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
-
- /* Now that we have bh_state locked, are we really still mapped? */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
- goto no_journal;
- }
-
- if (jh->b_transaction) {
- JBUFFER_TRACE(jh, "has transaction");
- if (jh->b_transaction != handle->h_transaction) {
- JBUFFER_TRACE(jh, "belongs to older transaction");
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_committing_transaction);
-
- /* @@@ IS THIS TRUE ? */
- /*
- * Not any more. Scenario: someone does a write()
- * in data=journal mode. The buffer's transaction has
- * moved into commit. Then someone does another
- * write() to the file. We do the frozen data copyout
- * and set b_next_transaction to point to j_running_t.
- * And while we're in that state, someone does a
- * writepage() in an attempt to pageout the same area
- * of the file via a shared mapping. At present that
- * calls jbd2_journal_dirty_data(), and we get right here.
- * It may be too late to journal the data. Simply
- * falling through to the next test will suffice: the
- * data will be dirty and wil be checkpointed. The
- * ordering comments in the next comment block still
- * apply.
- */
- //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-
- /*
- * If we're journalling data, and this buffer was
- * subject to a write(), it could be metadata, forget
- * or shadow against the committing transaction. Now,
- * someone has dirtied the same darn page via a mapping
- * and it is being writepage()'d.
- * We *could* just steal the page from commit, with some
- * fancy locking there. Instead, we just skip it -
- * don't tie the page's buffers to the new transaction
- * at all.
- * Implication: if we crash before the writepage() data
- * is written into the filesystem, recovery will replay
- * the write() data.
- */
- if (jh->b_jlist != BJ_None &&
- jh->b_jlist != BJ_SyncData &&
- jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "Not stealing");
- goto no_journal;
- }
-
- /*
- * This buffer may be undergoing writeout in commit. We
- * can't return from here and let the caller dirty it
- * again because that can cause the write-out loop in
- * commit to never terminate.
- */
- if (buffer_dirty(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- need_brelse = 1;
- sync_dirty_buffer(bh);
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- /* Since we dropped the lock... */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "buffer got unmapped");
- goto no_journal;
- }
- /* The buffer may become locked again at any
- time if it is redirtied */
- }
-
- /* journal_clean_data_list() may have got there first */
- if (jh->b_transaction != NULL) {
- JBUFFER_TRACE(jh, "unfile from commit");
- __jbd2_journal_temp_unlink_buffer(jh);
- /* It still points to the committing
- * transaction; move it to this one so
- * that the refile assert checks are
- * happy. */
- jh->b_transaction = handle->h_transaction;
- }
- /* The buffer will be refiled below */
-
- }
- /*
- * Special case --- the buffer might actually have been
- * allocated and then immediately deallocated in the previous,
- * committing transaction, so might still be left on that
- * transaction's metadata lists.
- */
- if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "not on correct data list: unfile");
- J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
- __jbd2_journal_temp_unlink_buffer(jh);
- jh->b_transaction = handle->h_transaction;
- JBUFFER_TRACE(jh, "file as data");
- __jbd2_journal_file_buffer(jh, handle->h_transaction,
- BJ_SyncData);
- }
- } else {
- JBUFFER_TRACE(jh, "not on a transaction");
- __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
- }
-no_journal:
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- if (need_brelse) {
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- }
- JBUFFER_TRACE(jh, "exit");
- jbd2_journal_put_journal_head(jh);
- return 0;
-}
-
-/**
* int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
@@ -1456,7 +1279,7 @@ int jbd2_journal_stop(handle_t *handle)
spin_unlock(&journal->j_state_lock);
}
- lock_release(&handle->h_lockdep_map, 1, _THIS_IP_);
+ lock_map_release(&handle->h_lockdep_map);
jbd2_free_handle(handle);
return err;
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
- * is holding onto a copy of one of thee pointers, it could go bad.
- * Generally the caller needs to re-read the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
+ * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
+ * of these pointers, it could go bad. Generally the caller needs to re-read
+ * the pointer from the transaction_t.
*
* Called under j_list_lock. The journal may not be locked.
*/
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
switch (jh->b_jlist) {
case BJ_None:
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers--;
J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
goto out;
spin_lock(&journal->j_list_lock);
- if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
- if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
- /* A written-back ordered data buffer */
- JBUFFER_TRACE(jh, "release data");
- __jbd2_journal_unfile_buffer(jh);
- jbd2_journal_remove_journal_head(bh);
- __brelse(bh);
- }
- } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+ if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
/* written-back checkpointed metadata buffer */
if (jh->b_jlist == BJ_None) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
return;
}
+/*
+ * jbd2_journal_try_to_free_buffers() could race with
+ * jbd2_journal_commit_transaction(). The later might still hold the
+ * reference count to the buffers when inspecting them on
+ * t_syncdata_list or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * try to free that buffer.
+ *
+ * Called with journal->j_state_lock hold.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction) {
+ spin_unlock(&journal->j_state_lock);
+ return;
+ }
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+}
/**
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard should we try to release
+ * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
+ * release the buffers.
*
*
* For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
/*
* We take our own ref against the journal_head here to avoid
* having to add tons of locking around each instance of
- * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+ * jbd2_journal_remove_journal_head() and
+ * jbd2_journal_put_journal_head().
*/
jh = jbd2_journal_grab_journal_head(bh);
if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where jbd2_journal_try_to_free_buffers()
+ * could race with jbd2_journal_commit_transaction(), the later still
+ * holds the reference to the buffers to free while processing them.
+ * try_to_free_buffers() failed to free those buffers. Some of the
+ * caller of releasepage() request page buffers to be dropped, otherwise
+ * treat the fail-to-free as errors (such as generic_file_direct_IO())
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour(i.e make sure buffers are dropped upon return),
+ * let's wait for the current transaction to finish flush of
+ * dirty data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+ jbd2_journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ }
+
busy:
return ret;
}
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
if (!buffer_jbd(bh))
goto zap_buffer_unlocked;
+ /* OK, we have data buffer in journaled mode */
spin_lock(&journal->j_state_lock);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
} else if (transaction == journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "on committing transaction");
- if (jh->b_jlist == BJ_Locked) {
- /*
- * The buffer is on the committing transaction's locked
- * list. We have the buffer locked, so I/O has
- * completed. So we can nail the buffer now.
- */
- may_free = __dispose_buffer(jh, transaction);
- goto zap_buffer;
- }
/*
* If it is committing, we simply cannot touch it. We
* can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
J_ASSERT_JH(jh, !jh->b_committed_data);
J_ASSERT_JH(jh, !jh->b_frozen_data);
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers++;
list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
spin_unlock(&journal->j_list_lock);
__brelse(bh);
}
+
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+
+ if (is_handle_aborted(handle))
+ return -EIO;
+
+ jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+ transaction->t_tid);
+
+ /*
+ * First check whether inode isn't already on the transaction's
+ * lists without taking the lock. Note that this check is safe
+ * without the lock as we cannot race with somebody removing inode
+ * from the transaction. The reason is that we remove inode from the
+ * transaction only in journal_release_jbd_inode() and when we commit
+ * the transaction. We are guarded from the first case by holding
+ * a reference to the inode. We are safe against the second case
+ * because if jinode->i_transaction == transaction, commit code
+ * cannot touch the transaction because we hold reference to it,
+ * and if jinode->i_next_transaction == transaction, commit code
+ * will only file the inode where we want it.
+ */
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ return 0;
+
+ spin_lock(&journal->j_list_lock);
+
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ goto done;
+
+ /* On some different transaction's list - should be
+ * the committing one */
+ if (jinode->i_transaction) {
+ J_ASSERT(jinode->i_next_transaction == NULL);
+ J_ASSERT(jinode->i_transaction ==
+ journal->j_committing_transaction);
+ jinode->i_next_transaction = transaction;
+ goto done;
+ }
+ /* Not on any transaction list... */
+ J_ASSERT(!jinode->i_next_transaction);
+ jinode->i_transaction = transaction;
+ list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+ spin_unlock(&journal->j_list_lock);
+
+ return 0;
+}
+
+/*
+ * This function must be called when inode is journaled in ordered mode
+ * before truncation happens. It starts writeout of truncated part in
+ * case it is in the committing transaction so that we stand to ordered
+ * mode consistency guarantees.
+ */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+ loff_t new_size)
+{
+ journal_t *journal;
+ transaction_t *commit_trans;
+ int ret = 0;
+
+ if (!inode->i_transaction && !inode->i_next_transaction)
+ goto out;
+ journal = inode->i_transaction->t_journal;
+ spin_lock(&journal->j_state_lock);
+ commit_trans = journal->j_committing_transaction;
+ spin_unlock(&journal->j_state_lock);
+ if (inode->i_transaction == commit_trans) {
+ ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+ new_size, LLONG_MAX);
+ if (ret)
+ jbd2_journal_abort(journal, ret);
+ }
+out:
+ return ret;
+}
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 4c80404a9ab..d98713777a1 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -314,7 +314,7 @@ static int jffs2_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int jffs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+int jffs2_permission(struct inode *inode, int mask)
{
return generic_permission(inode, mask, jffs2_check_acl);
}
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 0bb7f003fd8..8ca058aed38 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -28,7 +28,7 @@ struct jffs2_acl_header {
#define JFFS2_ACL_NOT_CACHED ((void *)-1)
-extern int jffs2_permission(struct inode *, int, struct nameidata *);
+extern int jffs2_permission(struct inode *, int);
extern int jffs2_acl_chmod(struct inode *);
extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
extern int jffs2_init_acl_post(struct inode *);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index c0c141f6fde..cd219ef5525 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -38,7 +38,7 @@ const struct file_operations jffs2_dir_operations =
{
.read = generic_read_dir,
.readdir = jffs2_readdir,
- .ioctl = jffs2_ioctl,
+ .unlocked_ioctl=jffs2_ioctl,
.fsync = jffs2_fsync
};
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5e920343b2c..5a98aa87c85 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -46,7 +46,7 @@ const struct file_operations jffs2_file_operations =
.aio_read = generic_file_aio_read,
.write = do_sync_write,
.aio_write = generic_file_aio_write,
- .ioctl = jffs2_ioctl,
+ .unlocked_ioctl=jffs2_ioctl,
.mmap = generic_file_readonly_mmap,
.fsync = jffs2_fsync,
.splice_read = generic_file_splice_read,
diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c
index e2177210f62..9d41f43e47b 100644
--- a/fs/jffs2/ioctl.c
+++ b/fs/jffs2/ioctl.c
@@ -12,8 +12,7 @@
#include <linux/fs.h>
#include "nodelist.h"
-int jffs2_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
- unsigned long arg)
+long jffs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
/* Later, this will provide for lsattr.jffs2 and chattr.jffs2, which
will include compression support etc. */
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 31559f45fdd..4c41db91eaa 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -12,7 +12,6 @@
#ifndef _JFFS2_FS_I
#define _JFFS2_FS_I
-#include <linux/version.h>
#include <linux/rbtree.h>
#include <linux/posix_acl.h>
#include <linux/mutex.h>
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 2cc866cf134..5e194a5c8e2 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -167,7 +167,7 @@ int jffs2_fsync(struct file *, struct dentry *, int);
int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
/* ioctl.c */
-int jffs2_ioctl(struct inode *, struct file *, unsigned int, unsigned long);
+long jffs2_ioctl(struct file *, unsigned int, unsigned long);
/* symlink.c */
extern const struct inode_operations jffs2_symlink_inode_operations;
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 629af01e5ad..6caf1e1ee26 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -23,6 +23,8 @@
int jffs2_sum_init(struct jffs2_sb_info *c)
{
+ uint32_t sum_size = max_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE);
+
c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL);
if (!c->summary) {
@@ -30,7 +32,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c)
return -ENOMEM;
}
- c->summary->sum_buf = vmalloc(c->sector_size);
+ c->summary->sum_buf = kmalloc(sum_size, GFP_KERNEL);
if (!c->summary->sum_buf) {
JFFS2_WARNING("Can't allocate buffer for writing out summary information!\n");
@@ -49,7 +51,7 @@ void jffs2_sum_exit(struct jffs2_sb_info *c)
jffs2_sum_disable_collecting(c->summary);
- vfree(c->summary->sum_buf);
+ kfree(c->summary->sum_buf);
c->summary->sum_buf = NULL;
kfree(c->summary);
@@ -665,7 +667,7 @@ crc_err:
/* Write summary data to flash - helper function for jffs2_sum_write_sumnode() */
static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb,
- uint32_t infosize, uint32_t datasize, int padsize)
+ uint32_t infosize, uint32_t datasize, int padsize)
{
struct jffs2_raw_summary isum;
union jffs2_sum_mem *temp;
@@ -676,6 +678,26 @@ static int jffs2_sum_write_data(struct jffs2_sb_info *c, struct jffs2_eraseblock
int ret;
size_t retlen;
+ if (padsize + datasize > MAX_SUMMARY_SIZE) {
+ /* It won't fit in the buffer. Abort summary for this jeb */
+ jffs2_sum_disable_collecting(c->summary);
+
+ JFFS2_WARNING("Summary too big (%d data, %d pad) in eraseblock at %08x\n",
+ datasize, padsize, jeb->offset);
+ /* Non-fatal */
+ return 0;
+ }
+ /* Is there enough space for summary? */
+ if (padsize < 0) {
+ /* don't try to write out summary for this jeb */
+ jffs2_sum_disable_collecting(c->summary);
+
+ JFFS2_WARNING("Not enough space for summary, padsize = %d\n",
+ padsize);
+ /* Non-fatal */
+ return 0;
+ }
+
memset(c->summary->sum_buf, 0xff, datasize);
memset(&isum, 0, sizeof(isum));
@@ -821,7 +843,7 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
{
int datasize, infosize, padsize;
struct jffs2_eraseblock *jeb;
- int ret;
+ int ret = 0;
dbg_summary("called\n");
@@ -841,16 +863,6 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
infosize += padsize;
datasize += padsize;
- /* Is there enough space for summary? */
- if (padsize < 0) {
- /* don't try to write out summary for this jeb */
- jffs2_sum_disable_collecting(c->summary);
-
- JFFS2_WARNING("Not enough space for summary, padsize = %d\n", padsize);
- spin_lock(&c->erase_completion_lock);
- return 0;
- }
-
ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize);
spin_lock(&c->erase_completion_lock);
return ret;
diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h
index 8bf34f2fa5c..60207a2ae95 100644
--- a/fs/jffs2/summary.h
+++ b/fs/jffs2/summary.h
@@ -13,6 +13,12 @@
#ifndef JFFS2_SUMMARY_H
#define JFFS2_SUMMARY_H
+/* Limit summary size to 64KiB so that we can kmalloc it. If the summary
+ is larger than that, we have to just ditch it and avoid using summary
+ for the eraseblock in question... and it probably doesn't hurt us much
+ anyway. */
+#define MAX_SUMMARY_SIZE 65536
+
#include <linux/uio.h>
#include <linux/jffs2.h>
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 7da69eae49e..efd401257ed 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -44,7 +44,7 @@ static void jffs2_destroy_inode(struct inode *inode)
kmem_cache_free(jffs2_inode_cachep, JFFS2_INODE_INFO(inode));
}
-static void jffs2_i_init_once(struct kmem_cache *cachep, void *foo)
+static void jffs2_i_init_once(void *foo)
{
struct jffs2_inode_info *f = foo;
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 4d84bdc8829..d3e5c33665d 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -140,7 +140,7 @@ static int jfs_check_acl(struct inode *inode, int mask)
return -EAGAIN;
}
-int jfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+int jfs_permission(struct inode *inode, int mask)
{
return generic_permission(inode, mask, jfs_check_acl);
}
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index 455fa429204..88475f10a38 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,7 +20,7 @@
#ifdef CONFIG_JFS_POSIX_ACL
-int jfs_permission(struct inode *, int, struct nameidata *);
+int jfs_permission(struct inode *, int);
int jfs_init_acl(tid_t, struct inode *, struct inode *);
int jfs_setattr(struct dentry *, struct iattr *);
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index bf6ab19b86e..6a73de84bce 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -21,6 +21,7 @@
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include <asm/uaccess.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
@@ -30,29 +31,19 @@
static struct proc_dir_entry *base;
#ifdef CONFIG_JFS_DEBUG
-static int loglevel_read(char *page, char **start, off_t off,
- int count, int *eof, void *data)
+static int jfs_loglevel_proc_show(struct seq_file *m, void *v)
{
- int len;
-
- len = sprintf(page, "%d\n", jfsloglevel);
-
- len -= off;
- *start = page + off;
-
- if (len > count)
- len = count;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
+ seq_printf(m, "%d\n", jfsloglevel);
+ return 0;
+}
- return len;
+static int jfs_loglevel_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_loglevel_proc_show, NULL);
}
-static int loglevel_write(struct file *file, const char __user *buffer,
- unsigned long count, void *data)
+static ssize_t jfs_loglevel_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *ppos)
{
char c;
@@ -65,22 +56,30 @@ static int loglevel_write(struct file *file, const char __user *buffer,
jfsloglevel = c - '0';
return count;
}
+
+static const struct file_operations jfs_loglevel_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_loglevel_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = jfs_loglevel_proc_write,
+};
#endif
static struct {
const char *name;
- read_proc_t *read_fn;
- write_proc_t *write_fn;
+ const struct file_operations *proc_fops;
} Entries[] = {
#ifdef CONFIG_JFS_STATISTICS
- { "lmstats", jfs_lmstats_read, },
- { "txstats", jfs_txstats_read, },
- { "xtstat", jfs_xtstat_read, },
- { "mpstat", jfs_mpstat_read, },
+ { "lmstats", &jfs_lmstats_proc_fops, },
+ { "txstats", &jfs_txstats_proc_fops, },
+ { "xtstat", &jfs_xtstat_proc_fops, },
+ { "mpstat", &jfs_mpstat_proc_fops, },
#endif
#ifdef CONFIG_JFS_DEBUG
- { "TxAnchor", jfs_txanchor_read, },
- { "loglevel", loglevel_read, loglevel_write }
+ { "TxAnchor", &jfs_txanchor_proc_fops, },
+ { "loglevel", &jfs_loglevel_proc_fops }
#endif
};
#define NPROCENT ARRAY_SIZE(Entries)
@@ -93,13 +92,8 @@ void jfs_proc_init(void)
return;
base->owner = THIS_MODULE;
- for (i = 0; i < NPROCENT; i++) {
- struct proc_dir_entry *p;
- if ((p = create_proc_entry(Entries[i].name, 0, base))) {
- p->read_proc = Entries[i].read_fn;
- p->write_proc = Entries[i].write_fn;
- }
- }
+ for (i = 0; i < NPROCENT; i++)
+ proc_create(Entries[i].name, 0, base, Entries[i].proc_fops);
}
void jfs_proc_clean(void)
diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index 044c1e654cc..eafd1300a00 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -62,7 +62,7 @@ extern void jfs_proc_clean(void);
extern int jfsloglevel;
-extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_txanchor_proc_fops;
/* information message: e.g., configuration, major event */
#define jfs_info(fmt, arg...) do { \
@@ -105,10 +105,10 @@ extern int jfs_txanchor_read(char *, char **, off_t, int, int *, void *);
* ----------
*/
#ifdef CONFIG_JFS_STATISTICS
-extern int jfs_lmstats_read(char *, char **, off_t, int, int *, void *);
-extern int jfs_txstats_read(char *, char **, off_t, int, int *, void *);
-extern int jfs_mpstat_read(char *, char **, off_t, int, int *, void *);
-extern int jfs_xtstat_read(char *, char **, off_t, int, int *, void *);
+extern const struct file_operations jfs_lmstats_proc_fops;
+extern const struct file_operations jfs_txstats_proc_fops;
+extern const struct file_operations jfs_mpstat_proc_fops;
+extern const struct file_operations jfs_xtstat_proc_fops;
#define INCREMENT(x) ((x)++)
#define DECREMENT(x) ((x)--)
diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h
index cdac2d5bafe..2545bb31723 100644
--- a/fs/jfs/jfs_dtree.h
+++ b/fs/jfs/jfs_dtree.h
@@ -243,9 +243,6 @@ typedef union {
#define JFS_REMOVE 3
#define JFS_RENAME 4
-#define DIRENTSIZ(namlen) \
- ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 )
-
/*
* Maximum file offset for directories.
*/
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 734ec916bea..d6363d8309d 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1520,7 +1520,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
jfs_error(ip->i_sb,
"diAlloc: can't find free bit "
"in wmap");
- return EIO;
+ return -EIO;
}
/* determine the inode number within the
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 325a9679b95..cd2ec2988b5 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -69,6 +69,7 @@
#include <linux/freezer.h>
#include <linux/delay.h>
#include <linux/mutex.h>
+#include <linux/seq_file.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
#include "jfs_metapage.h"
@@ -2503,13 +2504,9 @@ exit:
}
#ifdef CONFIG_JFS_STATISTICS
-int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_lmstats_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
-
- len += sprintf(buffer,
+ seq_printf(m,
"JFS Logmgr stats\n"
"================\n"
"commits = %d\n"
@@ -2522,19 +2519,19 @@ int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length,
lmStat.pagedone,
lmStat.full_page,
lmStat.partial_page);
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_lmstats_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_lmstats_proc_show, NULL);
}
+
+const struct file_operations jfs_lmstats_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_lmstats_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif /* CONFIG_JFS_STATISTICS */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index d1e64f2f2fc..c350057087d 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -19,10 +19,12 @@
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/module.h>
#include <linux/bio.h>
#include <linux/init.h>
#include <linux/buffer_head.h>
#include <linux/mempool.h>
+#include <linux/seq_file.h>
#include "jfs_incore.h"
#include "jfs_superblock.h"
#include "jfs_filsys.h"
@@ -180,7 +182,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
#endif
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct metapage *mp = (struct metapage *)foo;
@@ -804,13 +806,9 @@ void __invalidate_metapages(struct inode *ip, s64 addr, int len)
}
#ifdef CONFIG_JFS_STATISTICS
-int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_mpstat_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
-
- len += sprintf(buffer,
+ seq_printf(m,
"JFS Metapage statistics\n"
"=======================\n"
"page allocations = %d\n"
@@ -819,19 +817,19 @@ int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length,
mpStat.pagealloc,
mpStat.pagefree,
mpStat.lockwait);
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_mpstat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_mpstat_proc_show, NULL);
}
+
+const struct file_operations jfs_mpstat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_mpstat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index e7c60ae6b5b..f26e4d03ada 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -49,6 +49,7 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kthread.h>
+#include <linux/seq_file.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_filsys.h"
@@ -3009,11 +3010,8 @@ int jfs_sync(void *arg)
}
#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
-int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_txanchor_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
char *freewait;
char *freelockwait;
char *lowlockwait;
@@ -3025,7 +3023,7 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
lowlockwait =
waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";
- len += sprintf(buffer,
+ seq_printf(m,
"JFS TxAnchor\n"
"============\n"
"freetid = %d\n"
@@ -3044,31 +3042,27 @@ int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
TxAnchor.tlocksInUse,
jfs_tlocks_low,
list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_txanchor_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_txanchor_proc_show, NULL);
}
+
+const struct file_operations jfs_txanchor_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_txanchor_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
-int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_txstats_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
-
- len += sprintf(buffer,
+ seq_printf(m,
"JFS TxStats\n"
"===========\n"
"calls to txBegin = %d\n"
@@ -3089,19 +3083,19 @@ int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
TxStat.txBeginAnon_lockslow,
TxStat.txLockAlloc,
TxStat.txLockAlloc_freelock);
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_txstats_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_txstats_proc_show, NULL);
}
+
+const struct file_operations jfs_txstats_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_txstats_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 5a61ebf2cbc..ae3acafb447 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -20,7 +20,9 @@
*/
#include <linux/fs.h>
+#include <linux/module.h>
#include <linux/quotaops.h>
+#include <linux/seq_file.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
#include "jfs_metapage.h"
@@ -4134,13 +4136,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
}
#ifdef CONFIG_JFS_STATISTICS
-int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
- int *eof, void *data)
+static int jfs_xtstat_proc_show(struct seq_file *m, void *v)
{
- int len = 0;
- off_t begin;
-
- len += sprintf(buffer,
+ seq_printf(m,
"JFS Xtree statistics\n"
"====================\n"
"searches = %d\n"
@@ -4149,19 +4147,19 @@ int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length,
xtStat.search,
xtStat.fastSearch,
xtStat.split);
+ return 0;
+}
- begin = offset;
- *start = buffer + begin;
- len -= begin;
-
- if (len > length)
- len = length;
- else
- *eof = 1;
-
- if (len < 0)
- len = 0;
-
- return len;
+static int jfs_xtstat_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, jfs_xtstat_proc_show, NULL);
}
+
+const struct file_operations jfs_xtstat_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = jfs_xtstat_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 0ba6778edaa..2aba8238681 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1455,7 +1455,7 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
free_UCSname(&key);
if (rc == -ENOENT) {
d_add(dentry, NULL);
- return ERR_PTR(0);
+ return NULL;
} else if (rc) {
jfs_err("jfs_lookup: dtSearch returned %d", rc);
return ERR_PTR(rc);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 50ea6545173..0dae345e481 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -22,6 +22,7 @@
#include <linux/parser.h>
#include <linux/completion.h>
#include <linux/vfs.h>
+#include <linux/quotaops.h>
#include <linux/mount.h>
#include <linux/moduleparam.h>
#include <linux/kthread.h>
@@ -198,7 +199,7 @@ enum {
Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_integrity, "integrity"},
{Opt_nointegrity, "nointegrity"},
{Opt_iocharset, "iocharset=%s"},
@@ -499,7 +500,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
inode = jfs_iget(sb, ROOT_I);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- goto out_no_root;
+ goto out_no_rw;
}
sb->s_root = d_alloc_root(inode);
if (!sb->s_root)
@@ -521,9 +522,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
return 0;
out_no_root:
- jfs_err("jfs_read_super: get root inode failed");
- if (inode)
- iput(inode);
+ jfs_err("jfs_read_super: get root dentry failed");
+ iput(inode);
out_no_rw:
rc = jfs_umount(sb);
@@ -760,7 +760,7 @@ static struct file_system_type jfs_fs_type = {
.fs_flags = FS_REQUIRES_DEV,
};
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo;
diff --git a/fs/libfs.c b/fs/libfs.c
index 892d41cb338..1add676a19d 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -216,8 +216,8 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
s->s_flags = MS_NOUSER;
s->s_maxbytes = ~0ULL;
- s->s_blocksize = 1024;
- s->s_blocksize_bits = 10;
+ s->s_blocksize = PAGE_SIZE;
+ s->s_blocksize_bits = PAGE_SHIFT;
s->s_magic = magic;
s->s_op = ops ? ops : &simple_super_operations;
s->s_time_gran = 1;
@@ -512,6 +512,20 @@ void simple_release_fs(struct vfsmount **mount, int *count)
mntput(mnt);
}
+/**
+ * simple_read_from_buffer - copy data from the buffer to user space
+ * @to: the user space buffer to read to
+ * @count: the maximum number of bytes to read
+ * @ppos: the current position in the buffer
+ * @from: the buffer to read from
+ * @available: the size of the buffer
+ *
+ * The simple_read_from_buffer() function reads up to @count bytes from the
+ * buffer @from at offset @ppos into the user space address starting at @to.
+ *
+ * On success, the number of bytes read is returned and the offset @ppos is
+ * advanced by this number, or negative value is returned on error.
+ **/
ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
const void *from, size_t available)
{
@@ -528,6 +542,20 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
return count;
}
+/**
+ * memory_read_from_buffer - copy data from the buffer
+ * @to: the kernel space buffer to read to
+ * @count: the maximum number of bytes to read
+ * @ppos: the current position in the buffer
+ * @from: the buffer to read from
+ * @available: the size of the buffer
+ *
+ * The memory_read_from_buffer() function reads up to @count bytes from the
+ * buffer @from at offset @ppos into the kernel space address starting at @to.
+ *
+ * On success, the number of bytes read is returned and the offset @ppos is
+ * advanced by this number, or negative value is returned on error.
+ **/
ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
const void *from, size_t available)
{
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index 7725a0a9a55..97f6073ab33 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,6 @@
obj-$(CONFIG_LOCKD) += lockd.o
lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \
- svcproc.o svcsubs.o mon.o xdr.o
+ svcproc.o svcsubs.o mon.o xdr.o grace.o
lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o
lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 0b45fd3a4bf..8307dd64bf4 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -54,14 +54,13 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4;
int status;
- status = lockd_up(nlm_init->protocol);
+ status = lockd_up();
if (status < 0)
return ERR_PTR(status);
- host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address,
+ host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
nlm_init->protocol, nlm_version,
- nlm_init->hostname,
- strlen(nlm_init->hostname));
+ nlm_init->hostname);
if (host == NULL) {
lockd_down();
return ERR_PTR(-ENOLCK);
@@ -142,7 +141,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
/*
* The server lockd has called us back to tell us the lock was granted
*/
-__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock)
+__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
{
const struct file_lock *fl = &lock->fl;
const struct nfs_fh *fh = &lock->fh;
@@ -166,7 +165,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock
*/
if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
continue;
- if (!nlm_cmp_addr(&block->b_host->h_addr, addr))
+ if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
continue;
if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
continue;
@@ -216,7 +215,7 @@ reclaimer(void *ptr)
/* This one ensures that our parent doesn't terminate while the
* reclaim is in progress */
lock_kernel();
- lockd_up(0); /* note: this cannot fail as lockd is already running */
+ lockd_up(); /* note: this cannot fail as lockd is already running */
dprintk("lockd: reclaiming locks for host %s\n", host->h_name);
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index 5df517b81f3..31668b690e0 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -224,7 +224,9 @@ void nlm_release_call(struct nlm_rqst *call)
static void nlmclnt_rpc_release(void *data)
{
+ lock_kernel();
nlm_release_call(data);
+ unlock_kernel();
}
static int nlm_wait_on_grace(wait_queue_head_t *queue)
@@ -430,7 +432,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
* Report the conflicting lock back to the application.
*/
fl->fl_start = req->a_res.lock.fl.fl_start;
- fl->fl_end = req->a_res.lock.fl.fl_start;
+ fl->fl_end = req->a_res.lock.fl.fl_end;
fl->fl_type = req->a_res.lock.fl.fl_type;
fl->fl_pid = 0;
break;
@@ -580,7 +582,15 @@ again:
}
if (status < 0)
goto out_unlock;
- status = nlm_stat_to_errno(resp->status);
+ /*
+ * EAGAIN doesn't make sense for sleeping locks, and in some
+ * cases NLM_LCK_DENIED is returned for a permanent error. So
+ * turn it into an ENOLCK.
+ */
+ if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP))
+ status = -ENOLCK;
+ else
+ status = nlm_stat_to_errno(resp->status);
out_unblock:
nlmclnt_finish_block(block);
out:
@@ -710,7 +720,9 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data)
die:
return;
retry_rebind:
+ lock_kernel();
nlm_rebind_host(req->a_host);
+ unlock_kernel();
retry_unlock:
rpc_restart_call(task);
}
@@ -788,7 +800,9 @@ retry_cancel:
/* Don't ever retry more than 3 times */
if (req->a_retries++ >= NLMCLNT_MAX_RETRIES)
goto die;
+ lock_kernel();
nlm_rebind_host(req->a_host);
+ unlock_kernel();
rpc_restart_call(task);
rpc_delay(task, 30 * HZ);
}
diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c
new file mode 100644
index 00000000000..183cc1f0af1
--- /dev/null
+++ b/fs/lockd/grace.c
@@ -0,0 +1,59 @@
+/*
+ * Common code for control of lockd and nfsv4 grace periods.
+ */
+
+#include <linux/module.h>
+#include <linux/lockd/bind.h>
+
+static LIST_HEAD(grace_list);
+static DEFINE_SPINLOCK(grace_lock);
+
+/**
+ * locks_start_grace
+ * @lm: who this grace period is for
+ *
+ * A grace period is a period during which locks should not be given
+ * out. Currently grace periods are only enforced by the two lock
+ * managers (lockd and nfsd), using the locks_in_grace() function to
+ * check when they are in a grace period.
+ *
+ * This function is called to start a grace period.
+ */
+void locks_start_grace(struct lock_manager *lm)
+{
+ spin_lock(&grace_lock);
+ list_add(&lm->list, &grace_list);
+ spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_start_grace);
+
+/**
+ * locks_end_grace
+ * @lm: who this grace period is for
+ *
+ * Call this function to state that the given lock manager is ready to
+ * resume regular locking. The grace period will not end until all lock
+ * managers that called locks_start_grace() also call locks_end_grace().
+ * Note that callers count on it being safe to call this more than once,
+ * and the second call should be a no-op.
+ */
+void locks_end_grace(struct lock_manager *lm)
+{
+ spin_lock(&grace_lock);
+ list_del_init(&lm->list);
+ spin_unlock(&grace_lock);
+}
+EXPORT_SYMBOL_GPL(locks_end_grace);
+
+/**
+ * locks_in_grace
+ *
+ * Lock managers call this function to determine when it is OK for them
+ * to answer ordinary lock requests, and when they should accept only
+ * lock reclaims.
+ */
+int locks_in_grace(void)
+{
+ return !list_empty(&grace_list);
+}
+EXPORT_SYMBOL_GPL(locks_in_grace);
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index a17664c7eac..9fd8889097b 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -11,16 +11,17 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/in.h>
+#include <linux/in6.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
#include <linux/lockd/sm_inter.h>
#include <linux/mutex.h>
+#include <net/ipv6.h>
#define NLMDBG_FACILITY NLMDBG_HOSTCACHE
#define NLM_HOST_NRHASH 32
-#define NLM_ADDRHASH(addr) (ntohl(addr) & (NLM_HOST_NRHASH-1))
#define NLM_HOST_REBIND (60 * HZ)
#define NLM_HOST_EXPIRE (300 * HZ)
#define NLM_HOST_COLLECT (120 * HZ)
@@ -30,42 +31,115 @@ static unsigned long next_gc;
static int nrhosts;
static DEFINE_MUTEX(nlm_host_mutex);
-
static void nlm_gc_hosts(void);
-static struct nsm_handle * __nsm_find(const struct sockaddr_in *,
- const char *, unsigned int, int);
-static struct nsm_handle * nsm_find(const struct sockaddr_in *sin,
- const char *hostname,
- unsigned int hostname_len);
+static struct nsm_handle *nsm_find(const struct sockaddr *sap,
+ const size_t salen,
+ const char *hostname,
+ const size_t hostname_len,
+ const int create);
+
+struct nlm_lookup_host_info {
+ const int server; /* search for server|client */
+ const struct sockaddr *sap; /* address to search for */
+ const size_t salen; /* it's length */
+ const unsigned short protocol; /* transport to search for*/
+ const u32 version; /* NLM version to search for */
+ const char *hostname; /* remote's hostname */
+ const size_t hostname_len; /* it's length */
+ const struct sockaddr *src_sap; /* our address (optional) */
+ const size_t src_len; /* it's length */
+};
+
+/*
+ * Hash function must work well on big- and little-endian platforms
+ */
+static unsigned int __nlm_hash32(const __be32 n)
+{
+ unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16);
+ return hash ^ (hash >> 8);
+}
+
+static unsigned int __nlm_hash_addr4(const struct sockaddr *sap)
+{
+ const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ return __nlm_hash32(sin->sin_addr.s_addr);
+}
+
+static unsigned int __nlm_hash_addr6(const struct sockaddr *sap)
+{
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ const struct in6_addr addr = sin6->sin6_addr;
+ return __nlm_hash32(addr.s6_addr32[0]) ^
+ __nlm_hash32(addr.s6_addr32[1]) ^
+ __nlm_hash32(addr.s6_addr32[2]) ^
+ __nlm_hash32(addr.s6_addr32[3]);
+}
+
+static unsigned int nlm_hash_address(const struct sockaddr *sap)
+{
+ unsigned int hash;
+
+ switch (sap->sa_family) {
+ case AF_INET:
+ hash = __nlm_hash_addr4(sap);
+ break;
+ case AF_INET6:
+ hash = __nlm_hash_addr6(sap);
+ break;
+ default:
+ hash = 0;
+ }
+ return hash & (NLM_HOST_NRHASH - 1);
+}
+
+static void nlm_clear_port(struct sockaddr *sap)
+{
+ switch (sap->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)sap)->sin_port = 0;
+ break;
+ case AF_INET6:
+ ((struct sockaddr_in6 *)sap)->sin6_port = 0;
+ break;
+ }
+}
+
+static void nlm_display_address(const struct sockaddr *sap,
+ char *buf, const size_t len)
+{
+ const struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+
+ switch (sap->sa_family) {
+ case AF_UNSPEC:
+ snprintf(buf, len, "unspecified");
+ break;
+ case AF_INET:
+ snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr));
+ break;
+ case AF_INET6:
+ if (ipv6_addr_v4mapped(&sin6->sin6_addr))
+ snprintf(buf, len, NIPQUAD_FMT,
+ NIPQUAD(sin6->sin6_addr.s6_addr32[3]));
+ else
+ snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr));
+ break;
+ default:
+ snprintf(buf, len, "unsupported address family");
+ break;
+ }
+}
/*
* Common host lookup routine for server & client
*/
-static struct nlm_host *nlm_lookup_host(int server,
- const struct sockaddr_in *sin,
- int proto, u32 version,
- const char *hostname,
- unsigned int hostname_len,
- const struct sockaddr_in *ssin)
+static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
{
struct hlist_head *chain;
struct hlist_node *pos;
struct nlm_host *host;
struct nsm_handle *nsm = NULL;
- int hash;
-
- dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT
- ", p=%d, v=%u, my role=%s, name=%.*s)\n",
- NIPQUAD(ssin->sin_addr.s_addr),
- NIPQUAD(sin->sin_addr.s_addr), proto, version,
- server? "server" : "client",
- hostname_len,
- hostname? hostname : "<none>");
-
- hash = NLM_ADDRHASH(sin->sin_addr.s_addr);
-
- /* Lock hash table */
mutex_lock(&nlm_host_mutex);
if (time_after_eq(jiffies, next_gc))
@@ -78,22 +152,22 @@ static struct nlm_host *nlm_lookup_host(int server,
* different NLM rpc_clients into one single nlm_host object.
* This would allow us to have one nlm_host per address.
*/
- chain = &nlm_hosts[hash];
+ chain = &nlm_hosts[nlm_hash_address(ni->sap)];
hlist_for_each_entry(host, pos, chain, h_hash) {
- if (!nlm_cmp_addr(&host->h_addr, sin))
+ if (!nlm_cmp_addr(nlm_addr(host), ni->sap))
continue;
/* See if we have an NSM handle for this client */
if (!nsm)
nsm = host->h_nsmhandle;
- if (host->h_proto != proto)
+ if (host->h_proto != ni->protocol)
continue;
- if (host->h_version != version)
+ if (host->h_version != ni->version)
continue;
- if (host->h_server != server)
+ if (host->h_server != ni->server)
continue;
- if (!nlm_cmp_addr(&host->h_saddr, ssin))
+ if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap))
continue;
/* Move to head of hash chain. */
@@ -101,30 +175,41 @@ static struct nlm_host *nlm_lookup_host(int server,
hlist_add_head(&host->h_hash, chain);
nlm_get_host(host);
+ dprintk("lockd: nlm_lookup_host found host %s (%s)\n",
+ host->h_name, host->h_addrbuf);
goto out;
}
- if (nsm)
- atomic_inc(&nsm->sm_count);
-
- host = NULL;
- /* Sadly, the host isn't in our hash table yet. See if
- * we have an NSM handle for it. If not, create one.
+ /*
+ * The host wasn't in our hash table. If we don't
+ * have an NSM handle for it yet, create one.
*/
- if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len)))
- goto out;
+ if (nsm)
+ atomic_inc(&nsm->sm_count);
+ else {
+ host = NULL;
+ nsm = nsm_find(ni->sap, ni->salen,
+ ni->hostname, ni->hostname_len, 1);
+ if (!nsm) {
+ dprintk("lockd: nlm_lookup_host failed; "
+ "no nsm handle\n");
+ goto out;
+ }
+ }
host = kzalloc(sizeof(*host), GFP_KERNEL);
if (!host) {
nsm_release(nsm);
+ dprintk("lockd: nlm_lookup_host failed; no memory\n");
goto out;
}
host->h_name = nsm->sm_name;
- host->h_addr = *sin;
- host->h_addr.sin_port = 0; /* ouch! */
- host->h_saddr = *ssin;
- host->h_version = version;
- host->h_proto = proto;
+ memcpy(nlm_addr(host), ni->sap, ni->salen);
+ host->h_addrlen = ni->salen;
+ nlm_clear_port(nlm_addr(host));
+ memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len);
+ host->h_version = ni->version;
+ host->h_proto = ni->protocol;
host->h_rpcclnt = NULL;
mutex_init(&host->h_mutex);
host->h_nextrebind = jiffies + NLM_HOST_REBIND;
@@ -135,7 +220,7 @@ static struct nlm_host *nlm_lookup_host(int server,
host->h_state = 0; /* pseudo NSM state */
host->h_nsmstate = 0; /* real NSM state */
host->h_nsmhandle = nsm;
- host->h_server = server;
+ host->h_server = ni->server;
hlist_add_head(&host->h_hash, chain);
INIT_LIST_HEAD(&host->h_lockowners);
spin_lock_init(&host->h_lock);
@@ -143,6 +228,15 @@ static struct nlm_host *nlm_lookup_host(int server,
INIT_LIST_HEAD(&host->h_reclaim);
nrhosts++;
+
+ nlm_display_address((struct sockaddr *)&host->h_addr,
+ host->h_addrbuf, sizeof(host->h_addrbuf));
+ nlm_display_address((struct sockaddr *)&host->h_srcaddr,
+ host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf));
+
+ dprintk("lockd: nlm_lookup_host created host %s\n",
+ host->h_name);
+
out:
mutex_unlock(&nlm_host_mutex);
return host;
@@ -170,33 +264,103 @@ nlm_destroy_host(struct nlm_host *host)
kfree(host);
}
-/*
- * Find an NLM server handle in the cache. If there is none, create it.
+/**
+ * nlmclnt_lookup_host - Find an NLM host handle matching a remote server
+ * @sap: network address of server
+ * @salen: length of server address
+ * @protocol: transport protocol to use
+ * @version: NLM protocol version
+ * @hostname: '\0'-terminated hostname of server
+ *
+ * Returns an nlm_host structure that matches the passed-in
+ * [server address, transport protocol, NLM version, server hostname].
+ * If one doesn't already exist in the host cache, a new handle is
+ * created and returned.
*/
-struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin,
- int proto, u32 version,
- const char *hostname,
- unsigned int hostname_len)
+struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
+ const size_t salen,
+ const unsigned short protocol,
+ const u32 version, const char *hostname)
{
- struct sockaddr_in ssin = {0};
-
- return nlm_lookup_host(0, sin, proto, version,
- hostname, hostname_len, &ssin);
+ const struct sockaddr source = {
+ .sa_family = AF_UNSPEC,
+ };
+ struct nlm_lookup_host_info ni = {
+ .server = 0,
+ .sap = sap,
+ .salen = salen,
+ .protocol = protocol,
+ .version = version,
+ .hostname = hostname,
+ .hostname_len = strlen(hostname),
+ .src_sap = &source,
+ .src_len = sizeof(source),
+ };
+
+ dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
+ (hostname ? hostname : "<none>"), version,
+ (protocol == IPPROTO_UDP ? "udp" : "tcp"));
+
+ return nlm_lookup_host(&ni);
}
-/*
- * Find an NLM client handle in the cache. If there is none, create it.
+/**
+ * nlmsvc_lookup_host - Find an NLM host handle matching a remote client
+ * @rqstp: incoming NLM request
+ * @hostname: name of client host
+ * @hostname_len: length of client hostname
+ *
+ * Returns an nlm_host structure that matches the [client address,
+ * transport protocol, NLM version, client hostname] of the passed-in
+ * NLM request. If one doesn't already exist in the host cache, a
+ * new handle is created and returned.
+ *
+ * Before possibly creating a new nlm_host, construct a sockaddr
+ * for a specific source address in case the local system has
+ * multiple network addresses. The family of the address in
+ * rq_daddr is guaranteed to be the same as the family of the
+ * address in rq_addr, so it's safe to use the same family for
+ * the source address.
*/
-struct nlm_host *
-nlmsvc_lookup_host(struct svc_rqst *rqstp,
- const char *hostname, unsigned int hostname_len)
+struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp,
+ const char *hostname,
+ const size_t hostname_len)
{
- struct sockaddr_in ssin = {0};
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ };
+ struct sockaddr_in6 sin6 = {
+ .sin6_family = AF_INET6,
+ };
+ struct nlm_lookup_host_info ni = {
+ .server = 1,
+ .sap = svc_addr(rqstp),
+ .salen = rqstp->rq_addrlen,
+ .protocol = rqstp->rq_prot,
+ .version = rqstp->rq_vers,
+ .hostname = hostname,
+ .hostname_len = hostname_len,
+ .src_len = rqstp->rq_addrlen,
+ };
+
+ dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__,
+ (int)hostname_len, hostname, rqstp->rq_vers,
+ (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp"));
+
+ switch (ni.sap->sa_family) {
+ case AF_INET:
+ sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr;
+ ni.src_sap = (struct sockaddr *)&sin;
+ break;
+ case AF_INET6:
+ ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6);
+ ni.src_sap = (struct sockaddr *)&sin6;
+ break;
+ default:
+ return NULL;
+ }
- ssin.sin_addr = rqstp->rq_daddr.addr;
- return nlm_lookup_host(1, svc_addr_in(rqstp),
- rqstp->rq_prot, rqstp->rq_vers,
- hostname, hostname_len, &ssin);
+ return nlm_lookup_host(&ni);
}
/*
@@ -207,9 +371,8 @@ nlm_bind_host(struct nlm_host *host)
{
struct rpc_clnt *clnt;
- dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n",
- NIPQUAD(host->h_saddr.sin_addr),
- NIPQUAD(host->h_addr.sin_addr));
+ dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n",
+ host->h_name, host->h_addrbuf, host->h_srcaddrbuf);
/* Lock host handle */
mutex_lock(&host->h_mutex);
@@ -221,7 +384,7 @@ nlm_bind_host(struct nlm_host *host)
if (time_after_eq(jiffies, host->h_nextrebind)) {
rpc_force_rebind(clnt);
host->h_nextrebind = jiffies + NLM_HOST_REBIND;
- dprintk("lockd: next rebind in %ld jiffies\n",
+ dprintk("lockd: next rebind in %lu jiffies\n",
host->h_nextrebind - jiffies);
}
} else {
@@ -234,9 +397,9 @@ nlm_bind_host(struct nlm_host *host)
};
struct rpc_create_args args = {
.protocol = host->h_proto,
- .address = (struct sockaddr *)&host->h_addr,
- .addrsize = sizeof(host->h_addr),
- .saddress = (struct sockaddr *)&host->h_saddr,
+ .address = nlm_addr(host),
+ .addrsize = host->h_addrlen,
+ .saddress = nlm_srcaddr(host),
.timeout = &timeparms,
.servername = host->h_name,
.program = &nlm_program,
@@ -324,12 +487,16 @@ void nlm_host_rebooted(const struct sockaddr_in *sin,
struct nsm_handle *nsm;
struct nlm_host *host;
- dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n",
- hostname, NIPQUAD(sin->sin_addr));
-
- /* Find the NSM handle for this peer */
- if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0)))
+ nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin),
+ hostname, hostname_len, 0);
+ if (nsm == NULL) {
+ dprintk("lockd: never saw rebooted peer '%.*s' before\n",
+ hostname_len, hostname);
return;
+ }
+
+ dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n",
+ hostname_len, hostname, nsm->sm_addrbuf);
/* When reclaiming locks on this peer, make sure that
* we set up a new notification */
@@ -461,22 +628,23 @@ nlm_gc_hosts(void)
static LIST_HEAD(nsm_handles);
static DEFINE_SPINLOCK(nsm_lock);
-static struct nsm_handle *
-__nsm_find(const struct sockaddr_in *sin,
- const char *hostname, unsigned int hostname_len,
- int create)
+static struct nsm_handle *nsm_find(const struct sockaddr *sap,
+ const size_t salen,
+ const char *hostname,
+ const size_t hostname_len,
+ const int create)
{
struct nsm_handle *nsm = NULL;
struct nsm_handle *pos;
- if (!sin)
+ if (!sap)
return NULL;
if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
if (printk_ratelimit()) {
printk(KERN_WARNING "Invalid hostname \"%.*s\" "
"in NFS lock request\n",
- hostname_len, hostname);
+ (int)hostname_len, hostname);
}
return NULL;
}
@@ -489,7 +657,7 @@ retry:
if (strlen(pos->sm_name) != hostname_len
|| memcmp(pos->sm_name, hostname, hostname_len))
continue;
- } else if (!nlm_cmp_addr(&pos->sm_addr, sin))
+ } else if (!nlm_cmp_addr(nsm_addr(pos), sap))
continue;
atomic_inc(&pos->sm_count);
kfree(nsm);
@@ -509,10 +677,13 @@ retry:
if (nsm == NULL)
return NULL;
- nsm->sm_addr = *sin;
+ memcpy(nsm_addr(nsm), sap, salen);
+ nsm->sm_addrlen = salen;
nsm->sm_name = (char *) (nsm + 1);
memcpy(nsm->sm_name, hostname, hostname_len);
nsm->sm_name[hostname_len] = '\0';
+ nlm_display_address((struct sockaddr *)&nsm->sm_addr,
+ nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf));
atomic_set(&nsm->sm_count, 1);
goto retry;
@@ -521,13 +692,6 @@ found:
return nsm;
}
-static struct nsm_handle *
-nsm_find(const struct sockaddr_in *sin, const char *hostname,
- unsigned int hostname_len)
-{
- return __nsm_find(sin, hostname, hostname_len, 1);
-}
-
/*
* Release an NSM handle
*/
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index e4d563543b1..4e7e958e8f6 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -51,7 +51,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res)
memset(&args, 0, sizeof(args));
args.mon_name = nsm->sm_name;
- args.addr = nsm->sm_addr.sin_addr.s_addr;
+ args.addr = nsm_addr_in(nsm)->sin_addr.s_addr;
args.prog = NLM_PROGRAM;
args.vers = 3;
args.proc = NLMPROC_NSM_NOTIFY;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 2169af4d545..c631a83931c 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -50,8 +50,7 @@ EXPORT_SYMBOL(nlmsvc_ops);
static DEFINE_MUTEX(nlmsvc_mutex);
static unsigned int nlmsvc_users;
static struct task_struct *nlmsvc_task;
-static struct svc_serv *nlmsvc_serv;
-int nlmsvc_grace_period;
+static struct svc_rqst *nlmsvc_rqst;
unsigned long nlmsvc_timeout;
/*
@@ -85,27 +84,23 @@ static unsigned long get_lockd_grace_period(void)
return nlm_timeout * 5 * HZ;
}
-unsigned long get_nfs_grace_period(void)
-{
- unsigned long lockdgrace = get_lockd_grace_period();
- unsigned long nfsdgrace = 0;
-
- if (nlmsvc_ops)
- nfsdgrace = nlmsvc_ops->get_grace_period();
-
- return max(lockdgrace, nfsdgrace);
-}
-EXPORT_SYMBOL(get_nfs_grace_period);
+static struct lock_manager lockd_manager = {
+};
-static unsigned long set_grace_period(void)
+static void grace_ender(struct work_struct *not_used)
{
- nlmsvc_grace_period = 1;
- return get_nfs_grace_period() + jiffies;
+ locks_end_grace(&lockd_manager);
}
-static inline void clear_grace_period(void)
+static DECLARE_DELAYED_WORK(grace_period_end, grace_ender);
+
+static void set_grace_period(void)
{
- nlmsvc_grace_period = 0;
+ unsigned long grace_period = get_lockd_grace_period();
+
+ locks_start_grace(&lockd_manager);
+ cancel_delayed_work_sync(&grace_period_end);
+ schedule_delayed_work(&grace_period_end, grace_period);
}
/*
@@ -116,7 +111,6 @@ lockd(void *vrqstp)
{
int err = 0, preverr = 0;
struct svc_rqst *rqstp = vrqstp;
- unsigned long grace_period_expire;
/* try_to_freeze() is called from svc_recv() */
set_freezable();
@@ -139,7 +133,7 @@ lockd(void *vrqstp)
nlm_timeout = LOCKD_DFLT_TIMEO;
nlmsvc_timeout = nlm_timeout * HZ;
- grace_period_expire = set_grace_period();
+ set_grace_period();
/*
* The main request loop. We don't terminate until the last
@@ -153,21 +147,12 @@ lockd(void *vrqstp)
flush_signals(current);
if (nlmsvc_ops) {
nlmsvc_invalidate_all();
- grace_period_expire = set_grace_period();
+ set_grace_period();
}
continue;
}
- /*
- * Retry any blocked locks that have been notified by
- * the VFS. Don't do this during grace period.
- * (Theoretically, there shouldn't even be blocked locks
- * during grace period).
- */
- if (!nlmsvc_grace_period) {
- timeout = nlmsvc_retry_blocked();
- } else if (time_before(grace_period_expire, jiffies))
- clear_grace_period();
+ timeout = nlmsvc_retry_blocked();
/*
* Find a socket with data available and call its
@@ -194,43 +179,38 @@ lockd(void *vrqstp)
svc_process(rqstp);
}
-
flush_signals(current);
+ cancel_delayed_work_sync(&grace_period_end);
if (nlmsvc_ops)
nlmsvc_invalidate_all();
nlm_shutdown_hosts();
-
unlock_kernel();
-
- nlmsvc_task = NULL;
- nlmsvc_serv = NULL;
-
- /* Exit the RPC thread */
- svc_exit_thread(rqstp);
-
return 0;
}
/*
- * Make any sockets that are needed but not present.
- * If nlm_udpport or nlm_tcpport were set as module
- * options, make those sockets unconditionally
+ * Ensure there are active UDP and TCP listeners for lockd.
+ *
+ * Even if we have only TCP NFS mounts and/or TCP NFSDs, some
+ * local services (such as rpc.statd) still require UDP, and
+ * some NFS servers do not yet support NLM over TCP.
+ *
+ * Returns zero if all listeners are available; otherwise a
+ * negative errno value is returned.
*/
-static int make_socks(struct svc_serv *serv, int proto)
+static int make_socks(struct svc_serv *serv)
{
static int warned;
struct svc_xprt *xprt;
int err = 0;
- if (proto == IPPROTO_UDP || nlm_udpport) {
- xprt = svc_find_xprt(serv, "udp", 0, 0);
- if (!xprt)
- err = svc_create_xprt(serv, "udp", nlm_udpport,
- SVC_SOCK_DEFAULTS);
- else
- svc_xprt_put(xprt);
- }
- if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) {
+ xprt = svc_find_xprt(serv, "udp", 0, 0);
+ if (!xprt)
+ err = svc_create_xprt(serv, "udp", nlm_udpport,
+ SVC_SOCK_DEFAULTS);
+ else
+ svc_xprt_put(xprt);
+ if (err >= 0) {
xprt = svc_find_xprt(serv, "tcp", 0, 0);
if (!xprt)
err = svc_create_xprt(serv, "tcp", nlm_tcpport,
@@ -250,22 +230,17 @@ static int make_socks(struct svc_serv *serv, int proto)
/*
* Bring up the lockd process if it's not already up.
*/
-int
-lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
+int lockd_up(void)
{
struct svc_serv *serv;
- struct svc_rqst *rqstp;
int error = 0;
mutex_lock(&nlmsvc_mutex);
/*
* Check whether we're already up and running.
*/
- if (nlmsvc_serv) {
- if (proto)
- error = make_socks(nlmsvc_serv, proto);
+ if (nlmsvc_rqst)
goto out;
- }
/*
* Sanity check: if there's no pid,
@@ -276,21 +251,23 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
"lockd_up: no pid, %d users??\n", nlmsvc_users);
error = -ENOMEM;
- serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
+ serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
goto out;
}
- if ((error = make_socks(serv, proto)) < 0)
+ error = make_socks(serv);
+ if (error < 0)
goto destroy_and_out;
/*
* Create the kernel thread and wait for it to start.
*/
- rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
- if (IS_ERR(rqstp)) {
- error = PTR_ERR(rqstp);
+ nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
+ if (IS_ERR(nlmsvc_rqst)) {
+ error = PTR_ERR(nlmsvc_rqst);
+ nlmsvc_rqst = NULL;
printk(KERN_WARNING
"lockd_up: svc_rqst allocation failed, error=%d\n",
error);
@@ -298,16 +275,15 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */
}
svc_sock_update_bufs(serv);
- nlmsvc_serv = rqstp->rq_server;
- nlmsvc_task = kthread_run(lockd, rqstp, serv->sv_name);
+ nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
if (IS_ERR(nlmsvc_task)) {
error = PTR_ERR(nlmsvc_task);
+ svc_exit_thread(nlmsvc_rqst);
nlmsvc_task = NULL;
- nlmsvc_serv = NULL;
+ nlmsvc_rqst = NULL;
printk(KERN_WARNING
"lockd_up: kthread_run failed, error=%d\n", error);
- svc_exit_thread(rqstp);
goto destroy_and_out;
}
@@ -346,6 +322,9 @@ lockd_down(void)
BUG();
}
kthread_stop(nlmsvc_task);
+ svc_exit_thread(nlmsvc_rqst);
+ nlmsvc_task = NULL;
+ nlmsvc_rqst = NULL;
out:
mutex_unlock(&nlmsvc_mutex);
}
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index 385437e3387..014f6ce4817 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -58,8 +58,7 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
return 0;
no_locks:
- if (host)
- nlm_release_host(host);
+ nlm_release_host(host);
if (error)
return error;
return nlm_lck_denied_nolocks;
@@ -84,23 +83,17 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
{
struct nlm_host *host;
struct nlm_file *file;
- int rc = rpc_success;
+ __be32 rc = rpc_success;
dprintk("lockd: TEST4 called\n");
resp->cookie = argp->cookie;
- /* Don't accept test requests during grace period */
- if (nlmsvc_grace_period) {
- resp->status = nlm_lck_denied_grace_period;
- return rc;
- }
-
/* Obtain client and file */
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
/* Now check for conflicting locks */
- resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie);
+ resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie);
if (resp->status == nlm_drop_reply)
rc = rpc_drop_reply;
else
@@ -117,18 +110,12 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
{
struct nlm_host *host;
struct nlm_file *file;
- int rc = rpc_success;
+ __be32 rc = rpc_success;
dprintk("lockd: LOCK called\n");
resp->cookie = argp->cookie;
- /* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period && !argp->reclaim) {
- resp->status = nlm_lck_denied_grace_period;
- return rc;
- }
-
/* Obtain client and file */
if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -146,8 +133,9 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
#endif
/* Now try to lock the file */
- resp->status = nlmsvc_lock(rqstp, file, &argp->lock,
- argp->block, &argp->cookie);
+ resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock,
+ argp->block, &argp->cookie,
+ argp->reclaim);
if (resp->status == nlm_drop_reply)
rc = rpc_drop_reply;
else
@@ -170,7 +158,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -203,7 +191,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -232,7 +220,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
dprintk("lockd: GRANTED called\n");
- resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock);
+ resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
return rpc_success;
}
@@ -248,7 +236,9 @@ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
static void nlm4svc_callback_release(void *data)
{
+ lock_kernel();
nlm_release_call(data);
+ unlock_kernel();
}
static const struct rpc_call_ops nlm4svc_callback_ops = {
@@ -340,7 +330,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period && !argp->reclaim) {
+ if (locks_in_grace() && !argp->reclaim) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -373,7 +363,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -431,11 +421,9 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
{
struct sockaddr_in saddr;
- memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
-
dprintk("lockd: SM_NOTIFY called\n");
- if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
- || ntohs(saddr.sin_port) >= 1024) {
+
+ if (!nlm_privileged_requester(rqstp)) {
char buf[RPC_MAX_ADDRBUFLEN];
printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 81aca859bfd..6063a8e4b9f 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -129,9 +129,9 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock)
static inline int nlm_cookie_match(struct nlm_cookie *a, struct nlm_cookie *b)
{
- if(a->len != b->len)
+ if (a->len != b->len)
return 0;
- if(memcmp(a->data,b->data,a->len))
+ if (memcmp(a->data, b->data, a->len))
return 0;
return 1;
}
@@ -180,6 +180,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
struct nlm_block *block;
struct nlm_rqst *call = NULL;
+ nlm_get_host(host);
call = nlm_alloc_call(host);
if (call == NULL)
return NULL;
@@ -358,10 +359,10 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block)
*/
__be32
nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
- struct nlm_lock *lock, int wait, struct nlm_cookie *cookie)
+ struct nlm_host *host, struct nlm_lock *lock, int wait,
+ struct nlm_cookie *cookie, int reclaim)
{
struct nlm_block *block = NULL;
- struct nlm_host *host;
int error;
__be32 ret;
@@ -373,11 +374,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
(long long)lock->fl.fl_end,
wait);
- /* Create host handle for callback */
- host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
- if (host == NULL)
- return nlm_lck_denied_nolocks;
-
/* Lock file against concurrent access */
mutex_lock(&file->f_mutex);
/* Get existing block (in case client is busy-waiting)
@@ -385,8 +381,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
*/
block = nlmsvc_lookup_block(file, lock);
if (block == NULL) {
- block = nlmsvc_create_block(rqstp, nlm_get_host(host), file,
- lock, cookie);
+ block = nlmsvc_create_block(rqstp, host, file, lock, cookie);
ret = nlm_lck_denied_nolocks;
if (block == NULL)
goto out;
@@ -411,20 +406,29 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
+ if (locks_in_grace() && !reclaim) {
+ ret = nlm_lck_denied_grace_period;
+ goto out;
+ }
+ if (reclaim && !locks_in_grace()) {
+ ret = nlm_lck_denied_grace_period;
+ goto out;
+ }
+
if (!wait)
lock->fl.fl_flags &= ~FL_SLEEP;
error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
lock->fl.fl_flags &= ~FL_SLEEP;
dprintk("lockd: vfs_lock_file returned %d\n", error);
- switch(error) {
+ switch (error) {
case 0:
ret = nlm_granted;
goto out;
case -EAGAIN:
ret = nlm_lck_denied;
- break;
- case -EINPROGRESS:
+ goto out;
+ case FILE_LOCK_DEFERRED:
if (wait)
break;
/* Filesystem lock operation is in progress
@@ -439,10 +443,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
- ret = nlm_lck_denied;
- if (!wait)
- goto out;
-
ret = nlm_lck_blocked;
/* Append to list of blocked */
@@ -450,7 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
out:
mutex_unlock(&file->f_mutex);
nlmsvc_release_block(block);
- nlm_release_host(host);
dprintk("lockd: nlmsvc_lock returned %u\n", ret);
return ret;
}
@@ -460,8 +459,8 @@ out:
*/
__be32
nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
- struct nlm_lock *lock, struct nlm_lock *conflock,
- struct nlm_cookie *cookie)
+ struct nlm_host *host, struct nlm_lock *lock,
+ struct nlm_lock *conflock, struct nlm_cookie *cookie)
{
struct nlm_block *block = NULL;
int error;
@@ -479,16 +478,9 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
if (block == NULL) {
struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
- struct nlm_host *host;
if (conf == NULL)
return nlm_granted;
- /* Create host handle for callback */
- host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len);
- if (host == NULL) {
- kfree(conf);
- return nlm_lck_denied_nolocks;
- }
block = nlmsvc_create_block(rqstp, host, file, lock, cookie);
if (block == NULL) {
kfree(conf);
@@ -519,8 +511,12 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
+ if (locks_in_grace()) {
+ ret = nlm_lck_denied_grace_period;
+ goto out;
+ }
error = vfs_test_lock(file->f_file, &lock->fl);
- if (error == -EINPROGRESS) {
+ if (error == FILE_LOCK_DEFERRED) {
ret = nlmsvc_defer_lock_rqst(rqstp, block);
goto out;
}
@@ -599,6 +595,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock)
(long long)lock->fl.fl_start,
(long long)lock->fl.fl_end);
+ if (locks_in_grace())
+ return nlm_lck_denied_grace_period;
+
mutex_lock(&file->f_mutex);
block = nlmsvc_lookup_block(file, lock);
mutex_unlock(&file->f_mutex);
@@ -744,8 +743,7 @@ nlmsvc_grant_blocked(struct nlm_block *block)
switch (error) {
case 0:
break;
- case -EAGAIN:
- case -EINPROGRESS:
+ case FILE_LOCK_DEFERRED:
dprintk("lockd: lock still blocked error %d\n", error);
nlmsvc_insert_block(block, NLM_NEVER);
nlmsvc_release_block(block);
@@ -795,6 +793,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
dprintk("lockd: GRANT_MSG RPC callback\n");
+ lock_kernel();
/* if the block is not on a list at this point then it has
* been invalidated. Don't try to requeue it.
*
@@ -804,7 +803,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
* for nlm_blocked?
*/
if (list_empty(&block->b_list))
- return;
+ goto out;
/* Technically, we should down the file semaphore here. Since we
* move the block towards the head of the queue only, no harm
@@ -818,13 +817,17 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
}
nlmsvc_insert_block(block, timeout);
svc_wake_up(block->b_daemon);
+out:
+ unlock_kernel();
}
static void nlmsvc_grant_release(void *data)
{
struct nlm_rqst *call = data;
+ lock_kernel();
nlmsvc_release_block(call->a_block);
+ unlock_kernel();
}
static const struct rpc_call_ops nlmsvc_grant_ops = {
@@ -892,7 +895,7 @@ nlmsvc_retry_blocked(void)
if (block->b_when == NLM_NEVER)
break;
- if (time_after(block->b_when,jiffies)) {
+ if (time_after(block->b_when, jiffies)) {
timeout = block->b_when - jiffies;
break;
}
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 88379cc6e0b..548b0bb2b84 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -87,8 +87,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
return 0;
no_locks:
- if (host)
- nlm_release_host(host);
+ nlm_release_host(host);
if (error)
return error;
return nlm_lck_denied_nolocks;
@@ -113,23 +112,17 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp,
{
struct nlm_host *host;
struct nlm_file *file;
- int rc = rpc_success;
+ __be32 rc = rpc_success;
dprintk("lockd: TEST called\n");
resp->cookie = argp->cookie;
- /* Don't accept test requests during grace period */
- if (nlmsvc_grace_period) {
- resp->status = nlm_lck_denied_grace_period;
- return rc;
- }
-
/* Obtain client and file */
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
/* Now check for conflicting locks */
- resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie));
+ resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie));
if (resp->status == nlm_drop_reply)
rc = rpc_drop_reply;
else
@@ -147,18 +140,12 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
{
struct nlm_host *host;
struct nlm_file *file;
- int rc = rpc_success;
+ __be32 rc = rpc_success;
dprintk("lockd: LOCK called\n");
resp->cookie = argp->cookie;
- /* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period && !argp->reclaim) {
- resp->status = nlm_lck_denied_grace_period;
- return rc;
- }
-
/* Obtain client and file */
if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
@@ -176,8 +163,9 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp,
#endif
/* Now try to lock the file */
- resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock,
- argp->block, &argp->cookie));
+ resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock,
+ argp->block, &argp->cookie,
+ argp->reclaim));
if (resp->status == nlm_drop_reply)
rc = rpc_drop_reply;
else
@@ -200,7 +188,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -233,7 +221,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -262,7 +250,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
dprintk("lockd: GRANTED called\n");
- resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock);
+ resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock);
dprintk("lockd: GRANTED status %d\n", ntohl(resp->status));
return rpc_success;
}
@@ -278,7 +266,9 @@ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
static void nlmsvc_callback_release(void *data)
{
+ lock_kernel();
nlm_release_call(data);
+ unlock_kernel();
}
static const struct rpc_call_ops nlmsvc_callback_ops = {
@@ -372,7 +362,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept new lock requests during grace period */
- if (nlmsvc_grace_period && !argp->reclaim) {
+ if (locks_in_grace() && !argp->reclaim) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -405,7 +395,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp,
resp->cookie = argp->cookie;
/* Don't accept requests during grace period */
- if (nlmsvc_grace_period) {
+ if (locks_in_grace()) {
resp->status = nlm_lck_denied_grace_period;
return rpc_success;
}
@@ -463,11 +453,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
{
struct sockaddr_in saddr;
- memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr));
-
dprintk("lockd: SM_NOTIFY called\n");
- if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK)
- || ntohs(saddr.sin_port) >= 1024) {
+
+ if (!nlm_privileged_requester(rqstp)) {
char buf[RPC_MAX_ADDRBUFLEN];
printk(KERN_WARNING "lockd: rejected NSM callback from %s\n",
svc_print_addr(rqstp, buf, sizeof(buf)));
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d1c48b539df..34c2766e27c 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -373,13 +373,16 @@ nlmsvc_free_host_resources(struct nlm_host *host)
}
}
-/*
- * Remove all locks held for clients
+/**
+ * nlmsvc_invalidate_all - remove all locks held for clients
+ *
+ * Release all locks held by NFS clients.
+ *
*/
void
nlmsvc_invalidate_all(void)
{
- /* Release all locks held by NFS clients.
+ /*
* Previously, the code would call
* nlmsvc_free_host_resources for each client in
* turn, which is about as inefficient as it gets.
@@ -396,6 +399,12 @@ nlmsvc_match_sb(void *datap, struct nlm_file *file)
return sb == file->f_file->f_path.mnt->mnt_sb;
}
+/**
+ * nlmsvc_unlock_all_by_sb - release locks held on this file system
+ * @sb: super block
+ *
+ * Release all locks held by clients accessing this file system.
+ */
int
nlmsvc_unlock_all_by_sb(struct super_block *sb)
{
@@ -409,17 +418,22 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb);
static int
nlmsvc_match_ip(void *datap, struct nlm_host *host)
{
- __be32 *server_addr = datap;
-
- return host->h_saddr.sin_addr.s_addr == *server_addr;
+ return nlm_cmp_addr(nlm_srcaddr(host), datap);
}
+/**
+ * nlmsvc_unlock_all_by_ip - release local locks by IP address
+ * @server_addr: server's IP address as seen by clients
+ *
+ * Release all locks held by clients accessing this host
+ * via the passed in IP address.
+ */
int
-nlmsvc_unlock_all_by_ip(__be32 server_addr)
+nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr)
{
int ret;
- ret = nlm_traverse_files(&server_addr, nlmsvc_match_ip, NULL);
- return ret ? -EIO : 0;
+ ret = nlm_traverse_files(server_addr, nlmsvc_match_ip, NULL);
+ return ret ? -EIO : 0;
}
EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_ip);
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 3e459e18cc3..1f226290c67 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -351,8 +351,6 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp)
argp->state = ntohl(*p++);
/* Preserve the address in network byte order */
argp->addr = *p++;
- argp->vers = *p++;
- argp->proto = *p++;
return xdr_argsize_check(rqstp, p);
}
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 43ff9397e6c..50c493a8ad8 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -358,8 +358,6 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp
argp->state = ntohl(*p++);
/* Preserve the address in network byte order */
argp->addr = *p++;
- argp->vers = *p++;
- argp->proto = *p++;
return xdr_argsize_check(rqstp, p);
}
diff --git a/fs/locks.c b/fs/locks.c
index 11dbf08651b..5eb259e3cd3 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -201,7 +201,7 @@ EXPORT_SYMBOL(locks_init_lock);
* Initialises the fields of the file lock which are invariant for
* free file_locks.
*/
-static void init_once(struct kmem_cache *cache, void *foo)
+static void init_once(void *foo)
{
struct file_lock *lock = (struct file_lock *) foo;
@@ -561,9 +561,6 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
/* insert into file's list */
fl->fl_next = *pos;
*pos = fl;
-
- if (fl->fl_ops && fl->fl_ops->fl_insert)
- fl->fl_ops->fl_insert(fl);
}
/*
@@ -586,9 +583,6 @@ static void locks_delete_lock(struct file_lock **thisfl_p)
fl->fl_fasync = NULL;
}
- if (fl->fl_ops && fl->fl_ops->fl_remove)
- fl->fl_ops->fl_remove(fl);
-
if (fl->fl_nspid) {
put_pid(fl->fl_nspid);
fl->fl_nspid = NULL;
@@ -785,8 +779,10 @@ find_conflict:
if (!flock_locks_conflict(request, fl))
continue;
error = -EAGAIN;
- if (request->fl_flags & FL_SLEEP)
- locks_insert_block(fl, request);
+ if (!(request->fl_flags & FL_SLEEP))
+ goto out;
+ error = FILE_LOCK_DEFERRED;
+ locks_insert_block(fl, request);
goto out;
}
if (request->fl_flags & FL_ACCESS)
@@ -842,7 +838,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
error = -EDEADLK;
if (posix_locks_deadlock(request, fl))
goto out;
- error = -EAGAIN;
+ error = FILE_LOCK_DEFERRED;
locks_insert_block(fl, request);
goto out;
}
@@ -1041,7 +1037,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
might_sleep ();
for (;;) {
error = posix_lock_file(filp, fl, NULL);
- if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP))
+ if (error != FILE_LOCK_DEFERRED)
break;
error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
if (!error)
@@ -1113,9 +1109,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
for (;;) {
error = __posix_lock_file(inode, &fl, NULL);
- if (error != -EAGAIN)
- break;
- if (!(fl.fl_flags & FL_SLEEP))
+ if (error != FILE_LOCK_DEFERRED)
break;
error = wait_event_interruptible(fl.fl_wait, !fl.fl_next);
if (!error) {
@@ -1537,7 +1531,7 @@ int flock_lock_file_wait(struct file *filp, struct file_lock *fl)
might_sleep();
for (;;) {
error = flock_lock_file(filp, fl);
- if ((error != -EAGAIN) || !(fl->fl_flags & FL_SLEEP))
+ if (error != FILE_LOCK_DEFERRED)
break;
error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
if (!error)
@@ -1722,17 +1716,17 @@ out:
* fl_grant is set. Callers expecting ->lock() to return asynchronously
* will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
* the request is for a blocking lock. When ->lock() does return asynchronously,
- * it must return -EINPROGRESS, and call ->fl_grant() when the lock
+ * it must return FILE_LOCK_DEFERRED, and call ->fl_grant() when the lock
* request completes.
* If the request is for non-blocking lock the file system should return
- * -EINPROGRESS then try to get the lock and call the callback routine with
- * the result. If the request timed out the callback routine will return a
+ * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
+ * with the result. If the request timed out the callback routine will return a
* nonzero return code and the file system should release the lock. The file
* system is also responsible to keep a corresponding posix lock when it
* grants a lock so the VFS can find out which locks are locally held and do
* the correct lock cleanup when required.
* The underlying filesystem must not drop the kernel lock or call
- * ->fl_grant() before returning to the caller with a -EINPROGRESS
+ * ->fl_grant() before returning to the caller with a FILE_LOCK_DEFERRED
* return code.
*/
int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
@@ -1744,6 +1738,30 @@ int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, str
}
EXPORT_SYMBOL_GPL(vfs_lock_file);
+static int do_lock_file_wait(struct file *filp, unsigned int cmd,
+ struct file_lock *fl)
+{
+ int error;
+
+ error = security_file_lock(filp, fl->fl_type);
+ if (error)
+ return error;
+
+ for (;;) {
+ error = vfs_lock_file(filp, cmd, fl, NULL);
+ if (error != FILE_LOCK_DEFERRED)
+ break;
+ error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
+ if (!error)
+ continue;
+
+ locks_delete_block(fl);
+ break;
+ }
+
+ return error;
+}
+
/* Apply the lock described by l to an open file descriptor.
* This implements both the F_SETLK and F_SETLKW commands of fcntl().
*/
@@ -1801,26 +1819,7 @@ again:
goto out;
}
- error = security_file_lock(filp, file_lock->fl_type);
- if (error)
- goto out;
-
- if (filp->f_op && filp->f_op->lock != NULL)
- error = filp->f_op->lock(filp, cmd, file_lock);
- else {
- for (;;) {
- error = posix_lock_file(filp, file_lock, NULL);
- if (error != -EAGAIN || cmd == F_SETLK)
- break;
- error = wait_event_interruptible(file_lock->fl_wait,
- !file_lock->fl_next);
- if (!error)
- continue;
-
- locks_delete_block(file_lock);
- break;
- }
- }
+ error = do_lock_file_wait(filp, cmd, file_lock);
/*
* Attempt to detect a close/fcntl race and recover by
@@ -1938,26 +1937,7 @@ again:
goto out;
}
- error = security_file_lock(filp, file_lock->fl_type);
- if (error)
- goto out;
-
- if (filp->f_op && filp->f_op->lock != NULL)
- error = filp->f_op->lock(filp, cmd, file_lock);
- else {
- for (;;) {
- error = posix_lock_file(filp, file_lock, NULL);
- if (error != -EAGAIN || cmd == F_SETLK64)
- break;
- error = wait_event_interruptible(file_lock->fl_wait,
- !file_lock->fl_next);
- if (!error)
- continue;
-
- locks_delete_block(file_lock);
- break;
- }
- }
+ error = do_lock_file_wait(filp, cmd, file_lock);
/*
* Attempt to detect a close/fcntl race and recover by
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 84f6242ba6f..d1d1eb84679 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -68,7 +68,7 @@ static void minix_destroy_inode(struct inode *inode)
kmem_cache_free(minix_inode_cachep, minix_i(inode));
}
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
{
struct minix_inode_info *ei = (struct minix_inode_info *) foo;
@@ -256,9 +256,6 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
if (!s->s_root)
goto out_iput;
- if (!NO_TRUNCATE)
- s->s_root->d_op = &minix_dentry_operations;
-
if (!(s->s_flags & MS_RDONLY)) {
if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */
ms->s_state &= ~MINIX_VALID_FS;
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 326edfe9610..e6a0b193bea 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -2,11 +2,6 @@
#include <linux/pagemap.h>
#include <linux/minix_fs.h>
-/*
- * change the define below to 0 if you want names > info->s_namelen chars to be
- * truncated. Else they will be disallowed (ENAMETOOLONG).
- */
-#define NO_TRUNCATE 1
#define INODE_VERSION(inode) minix_sb(inode->i_sb)->s_version
#define MINIX_V1 0x0001 /* original minix fs */
#define MINIX_V2 0x0002 /* minix V2 fs */
@@ -83,7 +78,6 @@ extern const struct inode_operations minix_file_inode_operations;
extern const struct inode_operations minix_dir_inode_operations;
extern const struct file_operations minix_file_operations;
extern const struct file_operations minix_dir_operations;
-extern struct dentry_operations minix_dentry_operations;
static inline struct minix_sb_info *minix_sb(struct super_block *sb)
{
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 102241bc9c7..32b131cd612 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -18,30 +18,6 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
return err;
}
-static int minix_hash(struct dentry *dentry, struct qstr *qstr)
-{
- unsigned long hash;
- int i;
- const unsigned char *name;
-
- i = minix_sb(dentry->d_inode->i_sb)->s_namelen;
- if (i >= qstr->len)
- return 0;
- /* Truncate the name in place, avoids having to define a compare
- function. */
- qstr->len = i;
- name = qstr->name;
- hash = init_name_hash();
- while (i--)
- hash = partial_name_hash(*name++, hash);
- qstr->hash = end_name_hash(hash);
- return 0;
-}
-
-struct dentry_operations minix_dentry_operations = {
- .d_hash = minix_hash,
-};
-
static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
{
struct inode * inode = NULL;
diff --git a/fs/mpage.c b/fs/mpage.c
index 235e4d3873a..dbcc7af76a1 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
bio_put(bio);
}
-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
bio->bi_end_io = mpage_end_io_read;
if (rw == WRITE)
@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
submit_bio(rw, bio);
return NULL;
}
+EXPORT_SYMBOL(mpage_bio_submit);
static struct bio *
mpage_alloc(struct block_device *bdev,
@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
* written, so it can intelligently allocate a suitably-sized BIO. For now,
* just allocate full-size (16-page) BIOs.
*/
-struct mpage_data {
- struct bio *bio;
- sector_t last_block_in_bio;
- get_block_t *get_block;
- unsigned use_writepage;
-};
-static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
{
struct mpage_data *mpd = data;
struct bio *bio = mpd->bio;
@@ -651,6 +646,7 @@ out:
mpd->bio = bio;
return ret;
}
+EXPORT_SYMBOL(__mpage_writepage);
/**
* mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 05ff4f1d702..e844b9809d2 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -14,12 +14,7 @@
/* Characters that are undesirable in an MS-DOS file name */
static unsigned char bad_chars[] = "*?<>|\"";
-static unsigned char bad_if_strict_pc[] = "+=,; ";
-/* GEMDOS is less restrictive */
-static unsigned char bad_if_strict_atari[] = " ";
-
-#define bad_if_strict(opts) \
- ((opts)->atari ? bad_if_strict_atari : bad_if_strict_pc)
+static unsigned char bad_if_strict[] = "+=,; ";
/***** Formats an MS-DOS file name. Rejects invalid names. */
static int msdos_format_name(const unsigned char *name, int len,
@@ -40,21 +35,20 @@ static int msdos_format_name(const unsigned char *name, int len,
/* Get rid of dot - test for it elsewhere */
name++;
len--;
- } else if (!opts->atari)
+ } else
return -EINVAL;
}
/*
- * disallow names that _really_ start with a dot for MS-DOS,
- * GEMDOS does not care
+ * disallow names that _really_ start with a dot
*/
- space = !opts->atari;
+ space = 1;
c = 0;
for (walk = res; len && walk - res < 8; walk++) {
c = *name++;
len--;
if (opts->name_check != 'r' && strchr(bad_chars, c))
return -EINVAL;
- if (opts->name_check == 's' && strchr(bad_if_strict(opts), c))
+ if (opts->name_check == 's' && strchr(bad_if_strict, c))
return -EINVAL;
if (c >= 'A' && c <= 'Z' && opts->name_check == 's')
return -EINVAL;
@@ -94,7 +88,7 @@ static int msdos_format_name(const unsigned char *name, int len,
if (opts->name_check != 'r' && strchr(bad_chars, c))
return -EINVAL;
if (opts->name_check == 's' &&
- strchr(bad_if_strict(opts), c))
+ strchr(bad_if_strict, c))
return -EINVAL;
if (c < ' ' || c == ':' || c == '\\')
return -EINVAL;
@@ -214,7 +208,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
dentry->d_op = &msdos_dentry_operations;
- lock_kernel();
+ lock_super(sb);
res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
if (res == -ENOENT)
goto add;
@@ -232,7 +226,7 @@ add:
if (dentry)
dentry->d_op = &msdos_dentry_operations;
out:
- unlock_kernel();
+ unlock_super(sb);
if (!res)
return dentry;
return ERR_PTR(res);
@@ -243,6 +237,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
int is_dir, int is_hid, int cluster,
struct timespec *ts, struct fat_slot_info *sinfo)
{
+ struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb);
struct msdos_dir_entry de;
__le16 time, date;
int err;
@@ -252,7 +247,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name,
if (is_hid)
de.attr |= ATTR_HIDDEN;
de.lcase = 0;
- fat_date_unix2dos(ts->tv_sec, &time, &date);
+ fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
de.cdate = de.adate = 0;
de.ctime = 0;
de.ctime_cs = 0;
@@ -286,7 +281,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
unsigned char msdos_name[MSDOS_NAME];
int err, is_hid;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
msdos_name, &MSDOS_SB(sb)->options);
@@ -315,7 +310,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
d_instantiate(dentry, inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
err = fat_flush_inodes(sb, dir, inode);
return err;
@@ -324,11 +319,12 @@ out:
/***** Remove a directory */
static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
{
+ struct super_block *sb = dir->i_sb;
struct inode *inode = dentry->d_inode;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
/*
* Check whether the directory is not in use, then check
* whether it is empty.
@@ -349,9 +345,9 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_ctime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(inode->i_sb, dir, inode);
+ err = fat_flush_inodes(sb, dir, inode);
return err;
}
@@ -366,7 +362,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct timespec ts;
int err, is_hid, cluster;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
msdos_name, &MSDOS_SB(sb)->options);
@@ -404,14 +400,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
d_instantiate(dentry, inode);
- unlock_kernel();
+ unlock_super(sb);
fat_flush_inodes(sb, dir, inode);
return 0;
out_free:
fat_free_clusters(dir, cluster);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -419,10 +415,11 @@ out:
static int msdos_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb= inode->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
if (err)
goto out;
@@ -434,9 +431,9 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
inode->i_ctime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(inode->i_sb, dir, inode);
+ err = fat_flush_inodes(sb, dir, inode);
return err;
}
@@ -618,10 +615,11 @@ error_inode:
static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
+ struct super_block *sb = old_dir->i_sb;
unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
int err, is_hid;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(old_dentry->d_name.name,
old_dentry->d_name.len, old_msdos_name,
@@ -640,9 +638,9 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
new_dir, new_msdos_name, new_dentry, is_hid);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir);
+ err = fat_flush_inodes(sb, old_dir, new_dir);
return err;
}
diff --git a/fs/namei.c b/fs/namei.c
index c7e43536c49..4ea63ed5e79 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -31,7 +31,6 @@
#include <linux/file.h>
#include <linux/fcntl.h>
#include <linux/device_cgroup.h>
-#include <asm/namei.h>
#include <asm/uaccess.h>
#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
@@ -185,6 +184,8 @@ int generic_permission(struct inode *inode, int mask,
{
umode_t mode = inode->i_mode;
+ mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
+
if (current->fsuid == inode->i_uid)
mode >>= 6;
else {
@@ -203,7 +204,7 @@ int generic_permission(struct inode *inode, int mask,
/*
* If the DACs are ok we don't need any capability check.
*/
- if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask))
+ if ((mask & ~mode) == 0)
return 0;
check_capabilities:
@@ -226,13 +227,9 @@ int generic_permission(struct inode *inode, int mask,
return -EACCES;
}
-int permission(struct inode *inode, int mask, struct nameidata *nd)
+int inode_permission(struct inode *inode, int mask)
{
- int retval, submask;
- struct vfsmount *mnt = NULL;
-
- if (nd)
- mnt = nd->path.mnt;
+ int retval;
if (mask & MAY_WRITE) {
umode_t mode = inode->i_mode;
@@ -251,19 +248,9 @@ int permission(struct inode *inode, int mask, struct nameidata *nd)
return -EACCES;
}
- if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
- /*
- * MAY_EXEC on regular files is denied if the fs is mounted
- * with the "noexec" flag.
- */
- if (mnt && (mnt->mnt_flags & MNT_NOEXEC))
- return -EACCES;
- }
-
/* Ordinary permission routines do not understand MAY_APPEND. */
- submask = mask & ~MAY_APPEND;
if (inode->i_op && inode->i_op->permission) {
- retval = inode->i_op->permission(inode, submask, nd);
+ retval = inode->i_op->permission(inode, mask);
if (!retval) {
/*
* Exec permission on a regular file is denied if none
@@ -277,7 +264,7 @@ int permission(struct inode *inode, int mask, struct nameidata *nd)
return -EACCES;
}
} else {
- retval = generic_permission(inode, submask, NULL);
+ retval = generic_permission(inode, mask, NULL);
}
if (retval)
return retval;
@@ -286,7 +273,8 @@ int permission(struct inode *inode, int mask, struct nameidata *nd)
if (retval)
return retval;
- return security_inode_permission(inode, mask, nd);
+ return security_inode_permission(inode,
+ mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND));
}
/**
@@ -301,7 +289,7 @@ int permission(struct inode *inode, int mask, struct nameidata *nd)
*/
int vfs_permission(struct nameidata *nd, int mask)
{
- return permission(nd->path.dentry->d_inode, mask, nd);
+ return inode_permission(nd->path.dentry->d_inode, mask);
}
/**
@@ -318,7 +306,7 @@ int vfs_permission(struct nameidata *nd, int mask)
*/
int file_permission(struct file *file, int mask)
{
- return permission(file->f_path.dentry->d_inode, mask, NULL);
+ return inode_permission(file->f_path.dentry->d_inode, mask);
}
/*
@@ -459,8 +447,7 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name,
* short-cut DAC fails, then call permission() to do more
* complete permission check.
*/
-static int exec_permission_lite(struct inode *inode,
- struct nameidata *nd)
+static int exec_permission_lite(struct inode *inode)
{
umode_t mode = inode->i_mode;
@@ -486,7 +473,7 @@ static int exec_permission_lite(struct inode *inode,
return -EACCES;
ok:
- return security_inode_permission(inode, MAY_EXEC, nd);
+ return security_inode_permission(inode, MAY_EXEC);
}
/*
@@ -519,7 +506,14 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
*/
result = d_lookup(parent, name);
if (!result) {
- struct dentry * dentry = d_alloc(parent, name);
+ struct dentry *dentry;
+
+ /* Don't create child dentry for a dead directory. */
+ result = ERR_PTR(-ENOENT);
+ if (IS_DEADDIR(dir))
+ goto out_unlock;
+
+ dentry = d_alloc(parent, name);
result = ERR_PTR(-ENOMEM);
if (dentry) {
result = dir->i_op->lookup(dir, dentry, nd);
@@ -528,6 +522,7 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
else
result = dentry;
}
+out_unlock:
mutex_unlock(&dir->i_mutex);
return result;
}
@@ -545,27 +540,16 @@ static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, s
return result;
}
-static int __emul_lookup_dentry(const char *, struct nameidata *);
-
/* SMP-safe */
-static __always_inline int
+static __always_inline void
walk_init_root(const char *name, struct nameidata *nd)
{
struct fs_struct *fs = current->fs;
read_lock(&fs->lock);
- if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) {
- nd->path = fs->altroot;
- path_get(&fs->altroot);
- read_unlock(&fs->lock);
- if (__emul_lookup_dentry(name,nd))
- return 0;
- read_lock(&fs->lock);
- }
nd->path = fs->root;
path_get(&fs->root);
read_unlock(&fs->lock);
- return 1;
}
/*
@@ -581,15 +565,13 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd
int result;
/* make sure the stuff we saved doesn't go away */
- dget(save.dentry);
- mntget(save.mnt);
+ path_get(&save);
result = __link_path_walk(name, nd);
if (result == -ESTALE) {
/* nd->path had been dropped */
nd->path = save;
- dget(nd->path.dentry);
- mntget(nd->path.mnt);
+ path_get(&nd->path);
nd->flags |= LOOKUP_REVAL;
result = __link_path_walk(name, nd);
}
@@ -608,12 +590,9 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
if (*link == '/') {
path_put(&nd->path);
- if (!walk_init_root(link, nd))
- /* weird __emul_prefix() stuff did it */
- goto out;
+ walk_init_root(link, nd);
}
res = link_path_walk(link, nd);
-out:
if (nd->depth || res || nd->last_type!=LAST_NORM)
return res;
/*
@@ -891,7 +870,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
unsigned int c;
nd->flags |= LOOKUP_CONTINUE;
- err = exec_permission_lite(inode, nd);
+ err = exec_permission_lite(inode);
if (err == -EAGAIN)
err = vfs_permission(nd, MAY_EXEC);
if (err)
@@ -1062,67 +1041,6 @@ static int path_walk(const char *name, struct nameidata *nd)
return link_path_walk(name, nd);
}
-/*
- * SMP-safe: Returns 1 and nd will have valid dentry and mnt, if
- * everything is done. Returns 0 and drops input nd, if lookup failed;
- */
-static int __emul_lookup_dentry(const char *name, struct nameidata *nd)
-{
- if (path_walk(name, nd))
- return 0; /* something went wrong... */
-
- if (!nd->path.dentry->d_inode ||
- S_ISDIR(nd->path.dentry->d_inode->i_mode)) {
- struct path old_path = nd->path;
- struct qstr last = nd->last;
- int last_type = nd->last_type;
- struct fs_struct *fs = current->fs;
-
- /*
- * NAME was not found in alternate root or it's a directory.
- * Try to find it in the normal root:
- */
- nd->last_type = LAST_ROOT;
- read_lock(&fs->lock);
- nd->path = fs->root;
- path_get(&fs->root);
- read_unlock(&fs->lock);
- if (path_walk(name, nd) == 0) {
- if (nd->path.dentry->d_inode) {
- path_put(&old_path);
- return 1;
- }
- path_put(&nd->path);
- }
- nd->path = old_path;
- nd->last = last;
- nd->last_type = last_type;
- }
- return 1;
-}
-
-void set_fs_altroot(void)
-{
- char *emul = __emul_prefix();
- struct nameidata nd;
- struct path path = {}, old_path;
- int err;
- struct fs_struct *fs = current->fs;
-
- if (!emul)
- goto set_it;
- err = path_lookup(emul, LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_NOALT, &nd);
- if (!err)
- path = nd.path;
-set_it:
- write_lock(&fs->lock);
- old_path = fs->altroot;
- fs->altroot = path;
- write_unlock(&fs->lock);
- if (old_path.dentry)
- path_put(&old_path);
-}
-
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
static int do_path_lookup(int dfd, const char *name,
unsigned int flags, struct nameidata *nd)
@@ -1138,14 +1056,6 @@ static int do_path_lookup(int dfd, const char *name,
if (*name=='/') {
read_lock(&fs->lock);
- if (fs->altroot.dentry && !(nd->flags & LOOKUP_NOALT)) {
- nd->path = fs->altroot;
- path_get(&fs->altroot);
- read_unlock(&fs->lock);
- if (__emul_lookup_dentry(name,nd))
- goto out; /* found in altroot */
- read_lock(&fs->lock);
- }
nd->path = fs->root;
path_get(&fs->root);
read_unlock(&fs->lock);
@@ -1179,7 +1089,6 @@ static int do_path_lookup(int dfd, const char *name,
}
retval = path_walk(name, nd);
-out:
if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
nd->path.dentry->d_inode))
audit_inode(name, nd->path.dentry);
@@ -1216,8 +1125,9 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
nd->flags = flags;
nd->depth = 0;
- nd->path.mnt = mntget(mnt);
- nd->path.dentry = dget(dentry);
+ nd->path.dentry = dentry;
+ nd->path.mnt = mnt;
+ path_get(&nd->path);
retval = path_walk(name, nd);
if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
@@ -1283,19 +1193,6 @@ static int path_lookup_create(int dfd, const char *name,
nd, open_flags, create_mode);
}
-int __user_path_lookup_open(const char __user *name, unsigned int lookup_flags,
- struct nameidata *nd, int open_flags)
-{
- char *tmp = getname(name);
- int err = PTR_ERR(tmp);
-
- if (!IS_ERR(tmp)) {
- err = __path_lookup_intent_open(AT_FDCWD, tmp, lookup_flags, nd, open_flags, 0);
- putname(tmp);
- }
- return err;
-}
-
static struct dentry *__lookup_hash(struct qstr *name,
struct dentry *base, struct nameidata *nd)
{
@@ -1318,7 +1215,14 @@ static struct dentry *__lookup_hash(struct qstr *name,
dentry = cached_lookup(base, name, nd);
if (!dentry) {
- struct dentry *new = d_alloc(base, name);
+ struct dentry *new;
+
+ /* Don't create child dentry for a dead directory. */
+ dentry = ERR_PTR(-ENOENT);
+ if (IS_DEADDIR(inode))
+ goto out;
+
+ new = d_alloc(base, name);
dentry = ERR_PTR(-ENOMEM);
if (!new)
goto out;
@@ -1341,7 +1245,7 @@ static struct dentry *lookup_hash(struct nameidata *nd)
{
int err;
- err = permission(nd->path.dentry->d_inode, MAY_EXEC, nd);
+ err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC);
if (err)
return ERR_PTR(err);
return __lookup_hash(&nd->last, nd->path.dentry, nd);
@@ -1389,7 +1293,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
if (err)
return ERR_PTR(err);
- err = permission(base->d_inode, MAY_EXEC, NULL);
+ err = inode_permission(base->d_inode, MAY_EXEC);
if (err)
return ERR_PTR(err);
return __lookup_hash(&this, base, NULL);
@@ -1417,22 +1321,40 @@ struct dentry *lookup_one_noperm(const char *name, struct dentry *base)
return __lookup_hash(&this, base, NULL);
}
-int __user_walk_fd(int dfd, const char __user *name, unsigned flags,
- struct nameidata *nd)
+int user_path_at(int dfd, const char __user *name, unsigned flags,
+ struct path *path)
{
+ struct nameidata nd;
char *tmp = getname(name);
int err = PTR_ERR(tmp);
-
if (!IS_ERR(tmp)) {
- err = do_path_lookup(dfd, tmp, flags, nd);
+
+ BUG_ON(flags & LOOKUP_PARENT);
+
+ err = do_path_lookup(dfd, tmp, flags, &nd);
putname(tmp);
+ if (!err)
+ *path = nd.path;
}
return err;
}
-int __user_walk(const char __user *name, unsigned flags, struct nameidata *nd)
+static int user_path_parent(int dfd, const char __user *path,
+ struct nameidata *nd, char **name)
{
- return __user_walk_fd(AT_FDCWD, name, flags, nd);
+ char *s = getname(path);
+ int error;
+
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+
+ error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd);
+ if (error)
+ putname(s);
+ else
+ *name = s;
+
+ return error;
}
/*
@@ -1479,7 +1401,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
BUG_ON(victim->d_parent->d_inode != dir);
audit_inode_child(victim->d_name.name, victim, dir);
- error = permission(dir,MAY_WRITE | MAY_EXEC, NULL);
+ error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
if (error)
return error;
if (IS_APPEND(dir))
@@ -1509,14 +1431,13 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
* 3. We should have write and exec permissions on dir
* 4. We can't do it if dir is immutable (done in permission())
*/
-static inline int may_create(struct inode *dir, struct dentry *child,
- struct nameidata *nd)
+static inline int may_create(struct inode *dir, struct dentry *child)
{
if (child->d_inode)
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
- return permission(dir,MAY_WRITE | MAY_EXEC, nd);
+ return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}
/*
@@ -1582,7 +1503,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
struct nameidata *nd)
{
- int error = may_create(dir, dentry, nd);
+ int error = may_create(dir, dentry);
if (error)
return error;
@@ -1756,7 +1677,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
int will_write;
int flag = open_to_namei_flags(open_flag);
- acc_mode = ACC_MODE(flag);
+ acc_mode = MAY_OPEN | ACC_MODE(flag);
/* O_TRUNC implies we need access checks for write permissions */
if (flag & O_TRUNC)
@@ -2026,7 +1947,7 @@ EXPORT_SYMBOL_GPL(lookup_create);
int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
{
- int error = may_create(dir, dentry, NULL);
+ int error = may_create(dir, dentry);
if (error)
return error;
@@ -2072,20 +1993,18 @@ static int may_mknod(mode_t mode)
asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
unsigned dev)
{
- int error = 0;
- char * tmp;
- struct dentry * dentry;
+ int error;
+ char *tmp;
+ struct dentry *dentry;
struct nameidata nd;
if (S_ISDIR(mode))
return -EPERM;
- tmp = getname(filename);
- if (IS_ERR(tmp))
- return PTR_ERR(tmp);
- error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
+ error = user_path_parent(dfd, filename, &nd, &tmp);
if (error)
- goto out;
+ return error;
+
dentry = lookup_create(&nd, 0);
if (IS_ERR(dentry)) {
error = PTR_ERR(dentry);
@@ -2117,7 +2036,6 @@ out_dput:
out_unlock:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
path_put(&nd.path);
-out:
putname(tmp);
return error;
@@ -2130,7 +2048,7 @@ asmlinkage long sys_mknod(const char __user *filename, int mode, unsigned dev)
int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
- int error = may_create(dir, dentry, NULL);
+ int error = may_create(dir, dentry);
if (error)
return error;
@@ -2157,14 +2075,10 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
struct dentry *dentry;
struct nameidata nd;
- tmp = getname(pathname);
- error = PTR_ERR(tmp);
- if (IS_ERR(tmp))
+ error = user_path_parent(dfd, pathname, &nd, &tmp);
+ if (error)
goto out_err;
- error = do_path_lookup(dfd, tmp, LOOKUP_PARENT, &nd);
- if (error)
- goto out;
dentry = lookup_create(&nd, 1);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
@@ -2182,7 +2096,6 @@ out_dput:
out_unlock:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
path_put(&nd.path);
-out:
putname(tmp);
out_err:
return error;
@@ -2260,13 +2173,9 @@ static long do_rmdir(int dfd, const char __user *pathname)
struct dentry *dentry;
struct nameidata nd;
- name = getname(pathname);
- if(IS_ERR(name))
- return PTR_ERR(name);
-
- error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
+ error = user_path_parent(dfd, pathname, &nd, &name);
if (error)
- goto exit;
+ return error;
switch(nd.last_type) {
case LAST_DOTDOT:
@@ -2295,7 +2204,6 @@ exit2:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
exit1:
path_put(&nd.path);
-exit:
putname(name);
return error;
}
@@ -2344,19 +2252,16 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
*/
static long do_unlinkat(int dfd, const char __user *pathname)
{
- int error = 0;
- char * name;
+ int error;
+ char *name;
struct dentry *dentry;
struct nameidata nd;
struct inode *inode = NULL;
- name = getname(pathname);
- if(IS_ERR(name))
- return PTR_ERR(name);
-
- error = do_path_lookup(dfd, name, LOOKUP_PARENT, &nd);
+ error = user_path_parent(dfd, pathname, &nd, &name);
if (error)
- goto exit;
+ return error;
+
error = -EISDIR;
if (nd.last_type != LAST_NORM)
goto exit1;
@@ -2383,7 +2288,6 @@ static long do_unlinkat(int dfd, const char __user *pathname)
iput(inode); /* truncate the inode here */
exit1:
path_put(&nd.path);
-exit:
putname(name);
return error;
@@ -2409,9 +2313,9 @@ asmlinkage long sys_unlink(const char __user *pathname)
return do_unlinkat(AT_FDCWD, pathname);
}
-int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, int mode)
+int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
{
- int error = may_create(dir, dentry, NULL);
+ int error = may_create(dir, dentry);
if (error)
return error;
@@ -2433,23 +2337,20 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, i
asmlinkage long sys_symlinkat(const char __user *oldname,
int newdfd, const char __user *newname)
{
- int error = 0;
- char * from;
- char * to;
+ int error;
+ char *from;
+ char *to;
struct dentry *dentry;
struct nameidata nd;
from = getname(oldname);
- if(IS_ERR(from))
+ if (IS_ERR(from))
return PTR_ERR(from);
- to = getname(newname);
- error = PTR_ERR(to);
- if (IS_ERR(to))
- goto out_putname;
- error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
+ error = user_path_parent(newdfd, newname, &nd, &to);
if (error)
- goto out;
+ goto out_putname;
+
dentry = lookup_create(&nd, 0);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
@@ -2458,14 +2359,13 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
error = mnt_want_write(nd.path.mnt);
if (error)
goto out_dput;
- error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO);
+ error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
mnt_drop_write(nd.path.mnt);
out_dput:
dput(dentry);
out_unlock:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
path_put(&nd.path);
-out:
putname(to);
out_putname:
putname(from);
@@ -2485,7 +2385,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
if (!inode)
return -ENOENT;
- error = may_create(dir, new_dentry, NULL);
+ error = may_create(dir, new_dentry);
if (error)
return error;
@@ -2499,19 +2399,19 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
return -EPERM;
if (!dir->i_op || !dir->i_op->link)
return -EPERM;
- if (S_ISDIR(old_dentry->d_inode->i_mode))
+ if (S_ISDIR(inode->i_mode))
return -EPERM;
error = security_inode_link(old_dentry, dir, new_dentry);
if (error)
return error;
- mutex_lock(&old_dentry->d_inode->i_mutex);
+ mutex_lock(&inode->i_mutex);
DQUOT_INIT(dir);
error = dir->i_op->link(old_dentry, dir, new_dentry);
- mutex_unlock(&old_dentry->d_inode->i_mutex);
+ mutex_unlock(&inode->i_mutex);
if (!error)
- fsnotify_link(dir, old_dentry->d_inode, new_dentry);
+ fsnotify_link(dir, inode, new_dentry);
return error;
}
@@ -2529,27 +2429,25 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
int flags)
{
struct dentry *new_dentry;
- struct nameidata nd, old_nd;
+ struct nameidata nd;
+ struct path old_path;
int error;
- char * to;
+ char *to;
if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
return -EINVAL;
- to = getname(newname);
- if (IS_ERR(to))
- return PTR_ERR(to);
-
- error = __user_walk_fd(olddfd, oldname,
- flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
- &old_nd);
+ error = user_path_at(olddfd, oldname,
+ flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
+ &old_path);
if (error)
- goto exit;
- error = do_path_lookup(newdfd, to, LOOKUP_PARENT, &nd);
+ return error;
+
+ error = user_path_parent(newdfd, newname, &nd, &to);
if (error)
goto out;
error = -EXDEV;
- if (old_nd.path.mnt != nd.path.mnt)
+ if (old_path.mnt != nd.path.mnt)
goto out_release;
new_dentry = lookup_create(&nd, 0);
error = PTR_ERR(new_dentry);
@@ -2558,7 +2456,7 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
error = mnt_want_write(nd.path.mnt);
if (error)
goto out_dput;
- error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry);
+ error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
mnt_drop_write(nd.path.mnt);
out_dput:
dput(new_dentry);
@@ -2566,10 +2464,9 @@ out_unlock:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
out_release:
path_put(&nd.path);
-out:
- path_put(&old_nd.path);
-exit:
putname(to);
+out:
+ path_put(&old_path);
return error;
}
@@ -2622,7 +2519,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
* we'll need to flip '..'.
*/
if (new_dir != old_dir) {
- error = permission(old_dentry->d_inode, MAY_WRITE, NULL);
+ error = inode_permission(old_dentry->d_inode, MAY_WRITE);
if (error)
return error;
}
@@ -2697,7 +2594,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
return error;
if (!new_dentry->d_inode)
- error = may_create(new_dir, new_dentry, NULL);
+ error = may_create(new_dir, new_dentry);
else
error = may_delete(new_dir, new_dentry, is_dir);
if (error)
@@ -2725,20 +2622,22 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
return error;
}
-static int do_rename(int olddfd, const char *oldname,
- int newdfd, const char *newname)
+asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
+ int newdfd, const char __user *newname)
{
- int error = 0;
- struct dentry * old_dir, * new_dir;
- struct dentry * old_dentry, *new_dentry;
- struct dentry * trap;
+ struct dentry *old_dir, *new_dir;
+ struct dentry *old_dentry, *new_dentry;
+ struct dentry *trap;
struct nameidata oldnd, newnd;
+ char *from;
+ char *to;
+ int error;
- error = do_path_lookup(olddfd, oldname, LOOKUP_PARENT, &oldnd);
+ error = user_path_parent(olddfd, oldname, &oldnd, &from);
if (error)
goto exit;
- error = do_path_lookup(newdfd, newname, LOOKUP_PARENT, &newnd);
+ error = user_path_parent(newdfd, newname, &newnd, &to);
if (error)
goto exit1;
@@ -2800,29 +2699,11 @@ exit3:
unlock_rename(new_dir, old_dir);
exit2:
path_put(&newnd.path);
+ putname(to);
exit1:
path_put(&oldnd.path);
-exit:
- return error;
-}
-
-asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
- int newdfd, const char __user *newname)
-{
- int error;
- char * from;
- char * to;
-
- from = getname(oldname);
- if(IS_ERR(from))
- return PTR_ERR(from);
- to = getname(newname);
- error = PTR_ERR(to);
- if (!IS_ERR(to)) {
- error = do_rename(olddfd, from, newdfd, to);
- putname(to);
- }
putname(from);
+exit:
return error;
}
@@ -2857,16 +2738,17 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
struct nameidata nd;
void *cookie;
+ int res;
nd.depth = 0;
cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
- if (!IS_ERR(cookie)) {
- int res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
- if (dentry->d_inode->i_op->put_link)
- dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
- cookie = ERR_PTR(res);
- }
- return PTR_ERR(cookie);
+ if (IS_ERR(cookie))
+ return PTR_ERR(cookie);
+
+ res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
+ if (dentry->d_inode->i_op->put_link)
+ dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
+ return res;
}
int vfs_follow_link(struct nameidata *nd, const char *link)
@@ -2959,8 +2841,7 @@ const struct inode_operations page_symlink_inode_operations = {
.put_link = page_put_link,
};
-EXPORT_SYMBOL(__user_walk);
-EXPORT_SYMBOL(__user_walk_fd);
+EXPORT_SYMBOL(user_path_at);
EXPORT_SYMBOL(follow_down);
EXPORT_SYMBOL(follow_up);
EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
@@ -2975,7 +2856,7 @@ EXPORT_SYMBOL(page_symlink);
EXPORT_SYMBOL(page_symlink_inode_operations);
EXPORT_SYMBOL(path_lookup);
EXPORT_SYMBOL(vfs_path_lookup);
-EXPORT_SYMBOL(permission);
+EXPORT_SYMBOL(inode_permission);
EXPORT_SYMBOL(vfs_permission);
EXPORT_SYMBOL(file_permission);
EXPORT_SYMBOL(unlock_rename);
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fc302c2a0e..6e283c93b50 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -112,9 +112,13 @@ struct vfsmount *alloc_vfsmnt(const char *name)
int err;
err = mnt_alloc_id(mnt);
- if (err) {
- kmem_cache_free(mnt_cache, mnt);
- return NULL;
+ if (err)
+ goto out_free_cache;
+
+ if (name) {
+ mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
+ if (!mnt->mnt_devname)
+ goto out_free_id;
}
atomic_set(&mnt->mnt_count, 1);
@@ -127,16 +131,14 @@ struct vfsmount *alloc_vfsmnt(const char *name)
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
atomic_set(&mnt->__mnt_writers, 0);
- if (name) {
- int size = strlen(name) + 1;
- char *newname = kmalloc(size, GFP_KERNEL);
- if (newname) {
- memcpy(newname, name, size);
- mnt->mnt_devname = newname;
- }
- }
}
return mnt;
+
+out_free_id:
+ mnt_free_id(mnt);
+out_free_cache:
+ kmem_cache_free(mnt_cache, mnt);
+ return NULL;
}
/*
@@ -309,10 +311,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
*/
if ((atomic_read(&mnt->__mnt_writers) < 0) &&
!(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
- printk(KERN_DEBUG "leak detected on mount(%p) writers "
+ WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
"count: %d\n",
mnt, atomic_read(&mnt->__mnt_writers));
- WARN_ON(1);
/* use the flag to keep the dmesg spam down */
mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
}
@@ -750,7 +751,7 @@ struct proc_fs_info {
const char *str;
};
-static void show_sb_opts(struct seq_file *m, struct super_block *sb)
+static int show_sb_opts(struct seq_file *m, struct super_block *sb)
{
static const struct proc_fs_info fs_info[] = {
{ MS_SYNCHRONOUS, ",sync" },
@@ -764,6 +765,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb)
if (sb->s_flags & fs_infop->flag)
seq_puts(m, fs_infop->str);
}
+
+ return security_sb_show_options(m, sb);
}
static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
@@ -806,11 +809,14 @@ static int show_vfsmnt(struct seq_file *m, void *v)
seq_putc(m, ' ');
show_type(m, mnt->mnt_sb);
seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
- show_sb_opts(m, mnt->mnt_sb);
+ err = show_sb_opts(m, mnt->mnt_sb);
+ if (err)
+ goto out;
show_mnt_opts(m, mnt);
if (mnt->mnt_sb->s_op->show_options)
err = mnt->mnt_sb->s_op->show_options(m, mnt);
seq_puts(m, " 0 0\n");
+out:
return err;
}
@@ -865,10 +871,13 @@ static int show_mountinfo(struct seq_file *m, void *v)
seq_putc(m, ' ');
mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
- show_sb_opts(m, sb);
+ err = show_sb_opts(m, sb);
+ if (err)
+ goto out;
if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt);
seq_putc(m, '\n');
+out:
return err;
}
@@ -1121,27 +1130,27 @@ static int do_umount(struct vfsmount *mnt, int flags)
asmlinkage long sys_umount(char __user * name, int flags)
{
- struct nameidata nd;
+ struct path path;
int retval;
- retval = __user_walk(name, LOOKUP_FOLLOW, &nd);
+ retval = user_path(name, &path);
if (retval)
goto out;
retval = -EINVAL;
- if (nd.path.dentry != nd.path.mnt->mnt_root)
+ if (path.dentry != path.mnt->mnt_root)
goto dput_and_out;
- if (!check_mnt(nd.path.mnt))
+ if (!check_mnt(path.mnt))
goto dput_and_out;
retval = -EPERM;
if (!capable(CAP_SYS_ADMIN))
goto dput_and_out;
- retval = do_umount(nd.path.mnt, flags);
+ retval = do_umount(path.mnt, flags);
dput_and_out:
/* we mustn't call path_put() as that would clear mnt_expiry_mark */
- dput(nd.path.dentry);
- mntput_no_expire(nd.path.mnt);
+ dput(path.dentry);
+ mntput_no_expire(path.mnt);
out:
return retval;
}
@@ -1658,31 +1667,31 @@ static noinline int do_new_mount(struct nameidata *nd, char *type, int flags,
if (IS_ERR(mnt))
return PTR_ERR(mnt);
- return do_add_mount(mnt, nd, mnt_flags, NULL);
+ return do_add_mount(mnt, &nd->path, mnt_flags, NULL);
}
/*
* add a mount into a namespace's mount tree
* - provide the option of adding the new mount to an expiration list
*/
-int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
+int do_add_mount(struct vfsmount *newmnt, struct path *path,
int mnt_flags, struct list_head *fslist)
{
int err;
down_write(&namespace_sem);
/* Something was mounted here while we slept */
- while (d_mountpoint(nd->path.dentry) &&
- follow_down(&nd->path.mnt, &nd->path.dentry))
+ while (d_mountpoint(path->dentry) &&
+ follow_down(&path->mnt, &path->dentry))
;
err = -EINVAL;
- if (!check_mnt(nd->path.mnt))
+ if (!check_mnt(path->mnt))
goto unlock;
/* Refuse the same filesystem on the same mount point */
err = -EBUSY;
- if (nd->path.mnt->mnt_sb == newmnt->mnt_sb &&
- nd->path.mnt->mnt_root == nd->path.dentry)
+ if (path->mnt->mnt_sb == newmnt->mnt_sb &&
+ path->mnt->mnt_root == path->dentry)
goto unlock;
err = -EINVAL;
@@ -1690,7 +1699,7 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
goto unlock;
newmnt->mnt_flags = mnt_flags;
- if ((err = graft_tree(newmnt, &nd->path)))
+ if ((err = graft_tree(newmnt, path)))
goto unlock;
if (fslist) /* add to the specified expiration list */
@@ -1965,7 +1974,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
struct fs_struct *fs)
{
struct mnt_namespace *new_ns;
- struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
+ struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
struct vfsmount *p, *q;
new_ns = kmalloc(sizeof(struct mnt_namespace), GFP_KERNEL);
@@ -2008,10 +2017,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
pwdmnt = p;
fs->pwd.mnt = mntget(q);
}
- if (p == fs->altroot.mnt) {
- altrootmnt = p;
- fs->altroot.mnt = mntget(q);
- }
}
p = next_mnt(p, mnt_ns->root);
q = next_mnt(q, new_ns->root);
@@ -2022,8 +2027,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
mntput(rootmnt);
if (pwdmnt)
mntput(pwdmnt);
- if (altrootmnt)
- mntput(altrootmnt);
return new_ns;
}
@@ -2176,28 +2179,26 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
const char __user * put_old)
{
struct vfsmount *tmp;
- struct nameidata new_nd, old_nd;
- struct path parent_path, root_parent, root;
+ struct path new, old, parent_path, root_parent, root;
int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
- &new_nd);
+ error = user_path_dir(new_root, &new);
if (error)
goto out0;
error = -EINVAL;
- if (!check_mnt(new_nd.path.mnt))
+ if (!check_mnt(new.mnt))
goto out1;
- error = __user_walk(put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old_nd);
+ error = user_path_dir(put_old, &old);
if (error)
goto out1;
- error = security_sb_pivotroot(&old_nd.path, &new_nd.path);
+ error = security_sb_pivotroot(&old, &new);
if (error) {
- path_put(&old_nd.path);
+ path_put(&old);
goto out1;
}
@@ -2206,69 +2207,69 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
path_get(&current->fs->root);
read_unlock(&current->fs->lock);
down_write(&namespace_sem);
- mutex_lock(&old_nd.path.dentry->d_inode->i_mutex);
+ mutex_lock(&old.dentry->d_inode->i_mutex);
error = -EINVAL;
- if (IS_MNT_SHARED(old_nd.path.mnt) ||
- IS_MNT_SHARED(new_nd.path.mnt->mnt_parent) ||
+ if (IS_MNT_SHARED(old.mnt) ||
+ IS_MNT_SHARED(new.mnt->mnt_parent) ||
IS_MNT_SHARED(root.mnt->mnt_parent))
goto out2;
if (!check_mnt(root.mnt))
goto out2;
error = -ENOENT;
- if (IS_DEADDIR(new_nd.path.dentry->d_inode))
+ if (IS_DEADDIR(new.dentry->d_inode))
goto out2;
- if (d_unhashed(new_nd.path.dentry) && !IS_ROOT(new_nd.path.dentry))
+ if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry))
goto out2;
- if (d_unhashed(old_nd.path.dentry) && !IS_ROOT(old_nd.path.dentry))
+ if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry))
goto out2;
error = -EBUSY;
- if (new_nd.path.mnt == root.mnt ||
- old_nd.path.mnt == root.mnt)
+ if (new.mnt == root.mnt ||
+ old.mnt == root.mnt)
goto out2; /* loop, on the same file system */
error = -EINVAL;
if (root.mnt->mnt_root != root.dentry)
goto out2; /* not a mountpoint */
if (root.mnt->mnt_parent == root.mnt)
goto out2; /* not attached */
- if (new_nd.path.mnt->mnt_root != new_nd.path.dentry)
+ if (new.mnt->mnt_root != new.dentry)
goto out2; /* not a mountpoint */
- if (new_nd.path.mnt->mnt_parent == new_nd.path.mnt)
+ if (new.mnt->mnt_parent == new.mnt)
goto out2; /* not attached */
/* make sure we can reach put_old from new_root */
- tmp = old_nd.path.mnt;
+ tmp = old.mnt;
spin_lock(&vfsmount_lock);
- if (tmp != new_nd.path.mnt) {
+ if (tmp != new.mnt) {
for (;;) {
if (tmp->mnt_parent == tmp)
goto out3; /* already mounted on put_old */
- if (tmp->mnt_parent == new_nd.path.mnt)
+ if (tmp->mnt_parent == new.mnt)
break;
tmp = tmp->mnt_parent;
}
- if (!is_subdir(tmp->mnt_mountpoint, new_nd.path.dentry))
+ if (!is_subdir(tmp->mnt_mountpoint, new.dentry))
goto out3;
- } else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry))
+ } else if (!is_subdir(old.dentry, new.dentry))
goto out3;
- detach_mnt(new_nd.path.mnt, &parent_path);
+ detach_mnt(new.mnt, &parent_path);
detach_mnt(root.mnt, &root_parent);
/* mount old root on put_old */
- attach_mnt(root.mnt, &old_nd.path);
+ attach_mnt(root.mnt, &old);
/* mount new_root on / */
- attach_mnt(new_nd.path.mnt, &root_parent);
+ attach_mnt(new.mnt, &root_parent);
touch_mnt_namespace(current->nsproxy->mnt_ns);
spin_unlock(&vfsmount_lock);
- chroot_fs_refs(&root, &new_nd.path);
- security_sb_post_pivotroot(&root, &new_nd.path);
+ chroot_fs_refs(&root, &new);
+ security_sb_post_pivotroot(&root, &new);
error = 0;
path_put(&root_parent);
path_put(&parent_path);
out2:
- mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex);
+ mutex_unlock(&old.dentry->d_inode->i_mutex);
up_write(&namespace_sem);
path_put(&root);
- path_put(&old_nd.path);
+ path_put(&old);
out1:
- path_put(&new_nd.path);
+ path_put(&new);
out0:
return error;
out3:
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 011ef0b6d2d..07e9715b865 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -266,7 +266,7 @@ leave_me:;
static int
-__ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd)
+__ncp_lookup_validate(struct dentry *dentry)
{
struct ncp_server *server;
struct dentry *parent;
@@ -340,7 +340,7 @@ ncp_lookup_validate(struct dentry * dentry, struct nameidata *nd)
{
int res;
lock_kernel();
- res = __ncp_lookup_validate(dentry, nd);
+ res = __ncp_lookup_validate(dentry);
unlock_kernel();
return res;
}
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 2b145de45b3..6a7d901f193 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
+#include <linux/smp_lock.h>
#include <linux/ncp_fs.h>
#include "ncplib_kernel.h"
@@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) {
return 0;
}
+static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t ret;
+ lock_kernel();
+ ret = generic_file_llseek_unlocked(file, offset, origin);
+ unlock_kernel();
+ return ret;
+}
+
const struct file_operations ncp_file_operations =
{
- .llseek = remote_llseek,
+ .llseek = ncp_remote_llseek,
.read = ncp_file_read,
.write = ncp_file_write,
.ioctl = ncp_ioctl,
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 2e5ab1204de..d642f0e5b36 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -64,7 +64,7 @@ static void ncp_destroy_inode(struct inode *inode)
kmem_cache_free(ncp_inode_cachep, NCP_FINFO(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct ncp_inode_info *ei = (struct ncp_inode_info *) foo;
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index c1e7c830062..6a09760c596 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -27,7 +27,7 @@
struct nfs_callback_data {
unsigned int users;
- struct svc_serv *serv;
+ struct svc_rqst *rqst;
struct task_struct *task;
};
@@ -91,25 +91,22 @@ nfs_callback_svc(void *vrqstp)
svc_process(rqstp);
}
unlock_kernel();
- nfs_callback_info.task = NULL;
- svc_exit_thread(rqstp);
return 0;
}
/*
- * Bring up the server process if it is not already up.
+ * Bring up the callback thread if it is not already up.
*/
int nfs_callback_up(void)
{
struct svc_serv *serv = NULL;
- struct svc_rqst *rqstp;
int ret = 0;
- lock_kernel();
mutex_lock(&nfs_callback_mutex);
if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
goto out;
- serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
+ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
+ AF_INET, NULL);
ret = -ENOMEM;
if (!serv)
goto out_err;
@@ -121,22 +118,23 @@ int nfs_callback_up(void)
nfs_callback_tcpport = ret;
dprintk("Callback port = 0x%x\n", nfs_callback_tcpport);
- rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]);
- if (IS_ERR(rqstp)) {
- ret = PTR_ERR(rqstp);
+ nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
+ if (IS_ERR(nfs_callback_info.rqst)) {
+ ret = PTR_ERR(nfs_callback_info.rqst);
+ nfs_callback_info.rqst = NULL;
goto out_err;
}
svc_sock_update_bufs(serv);
- nfs_callback_info.serv = serv;
- nfs_callback_info.task = kthread_run(nfs_callback_svc, rqstp,
+ nfs_callback_info.task = kthread_run(nfs_callback_svc,
+ nfs_callback_info.rqst,
"nfsv4-svc");
if (IS_ERR(nfs_callback_info.task)) {
ret = PTR_ERR(nfs_callback_info.task);
- nfs_callback_info.serv = NULL;
+ svc_exit_thread(nfs_callback_info.rqst);
+ nfs_callback_info.rqst = NULL;
nfs_callback_info.task = NULL;
- svc_exit_thread(rqstp);
goto out_err;
}
out:
@@ -149,7 +147,6 @@ out:
if (serv)
svc_destroy(serv);
mutex_unlock(&nfs_callback_mutex);
- unlock_kernel();
return ret;
out_err:
dprintk("Couldn't create callback socket or server thread; err = %d\n",
@@ -159,17 +156,19 @@ out_err:
}
/*
- * Kill the server process if it is not already down.
+ * Kill the callback thread if it's no longer being used.
*/
void nfs_callback_down(void)
{
- lock_kernel();
mutex_lock(&nfs_callback_mutex);
nfs_callback_info.users--;
- if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL)
+ if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) {
kthread_stop(nfs_callback_info.task);
+ svc_exit_thread(nfs_callback_info.rqst);
+ nfs_callback_info.rqst = NULL;
+ nfs_callback_info.task = NULL;
+ }
mutex_unlock(&nfs_callback_mutex);
- unlock_kernel();
}
static int nfs_callback_authenticate(struct svc_rqst *rqstp)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f2a092ca69b..5ee23e7058b 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -431,14 +431,14 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
{
to->to_initval = timeo * HZ / 10;
to->to_retries = retrans;
- if (!to->to_retries)
- to->to_retries = 2;
switch (proto) {
case XPRT_TRANSPORT_TCP:
case XPRT_TRANSPORT_RDMA:
+ if (to->to_retries == 0)
+ to->to_retries = NFS_DEF_TCP_RETRANS;
if (to->to_initval == 0)
- to->to_initval = 60 * HZ;
+ to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
to->to_initval = NFS_MAX_TCP_TIMEOUT;
to->to_increment = to->to_initval;
@@ -450,14 +450,17 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
to->to_exponential = 0;
break;
case XPRT_TRANSPORT_UDP:
- default:
+ if (to->to_retries == 0)
+ to->to_retries = NFS_DEF_UDP_RETRANS;
if (!to->to_initval)
- to->to_initval = 11 * HZ / 10;
+ to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
to->to_initval = NFS_MAX_UDP_TIMEOUT;
to->to_maxval = NFS_MAX_UDP_TIMEOUT;
to->to_exponential = 1;
break;
+ default:
+ BUG();
}
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 58d43daec08..74f92b717f7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -133,13 +133,14 @@ nfs_opendir(struct inode *inode, struct file *filp)
{
int res;
- dfprintk(VFS, "NFS: opendir(%s/%ld)\n",
- inode->i_sb->s_id, inode->i_ino);
+ dfprintk(FILE, "NFS: open dir(%s/%s)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name);
+
+ nfs_inc_stats(inode, NFSIOS_VFSOPEN);
- lock_kernel();
/* Call generic open code in order to cache credentials */
res = nfs_open(inode, filp);
- unlock_kernel();
return res;
}
@@ -204,7 +205,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
* Note: assumes we have exclusive access to this mapping either
* through inode->i_mutex or some other mechanism.
*/
- if (page->index == 0 && invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1) < 0) {
+ if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
/* Should never happen */
nfs_zap_mapping(inode, inode->i_mapping);
}
@@ -528,13 +529,11 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
struct nfs_fattr fattr;
long res;
- dfprintk(VFS, "NFS: readdir(%s/%s) starting at cookie %Lu\n",
+ dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
(long long)filp->f_pos);
nfs_inc_stats(inode, NFSIOS_VFSGETDENTS);
- lock_kernel();
-
/*
* filp->f_pos points to the dirent entry number.
* *desc->dir_cookie has the cookie for the next entry. We have
@@ -592,10 +591,9 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
out:
nfs_unblock_sillyrename(dentry);
- unlock_kernel();
if (res > 0)
res = 0;
- dfprintk(VFS, "NFS: readdir(%s/%s) returns %ld\n",
+ dfprintk(FILE, "NFS: readdir(%s/%s) returns %ld\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
res);
return res;
@@ -603,7 +601,15 @@ out:
static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
{
- mutex_lock(&filp->f_path.dentry->d_inode->i_mutex);
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+
+ dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
+ dentry->d_parent->d_name.name,
+ dentry->d_name.name,
+ offset, origin);
+
+ mutex_lock(&inode->i_mutex);
switch (origin) {
case 1:
offset += filp->f_pos;
@@ -619,7 +625,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
nfs_file_open_context(filp)->dir_cookie = 0;
}
out:
- mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex);
+ mutex_unlock(&inode->i_mutex);
return offset;
}
@@ -629,10 +635,11 @@ out:
*/
static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
{
- dfprintk(VFS, "NFS: fsync_dir(%s/%s) datasync %d\n",
+ dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
datasync);
+ nfs_inc_stats(dentry->d_inode, NFSIOS_VFSFSYNC);
return 0;
}
@@ -767,7 +774,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
struct nfs_fattr fattr;
parent = dget_parent(dentry);
- lock_kernel();
dir = parent->d_inode;
nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
inode = dentry->d_inode;
@@ -805,7 +811,6 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_valid:
- unlock_kernel();
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n",
__func__, dentry->d_parent->d_name.name,
@@ -824,7 +829,6 @@ out_zap_parent:
shrink_dcache_parent(dentry);
}
d_drop(dentry);
- unlock_kernel();
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
__func__, dentry->d_parent->d_name.name,
@@ -858,6 +862,14 @@ static int nfs_dentry_delete(struct dentry *dentry)
}
+static void nfs_drop_nlink(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ if (inode->i_nlink > 0)
+ drop_nlink(inode);
+ spin_unlock(&inode->i_lock);
+}
+
/*
* Called when the dentry loses inode.
* We use it to clean up silly-renamed files.
@@ -869,10 +881,8 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
- lock_kernel();
drop_nlink(inode);
nfs_complete_unlink(dentry, inode);
- unlock_kernel();
}
iput(inode);
}
@@ -903,8 +913,6 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
res = ERR_PTR(-ENOMEM);
dentry->d_op = NFS_PROTO(dir)->dentry_ops;
- lock_kernel();
-
/*
* If we're doing an exclusive create, optimize away the lookup
* but don't hash the dentry.
@@ -912,7 +920,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
if (nfs_is_exclusive_create(dir, nd)) {
d_instantiate(dentry, NULL);
res = NULL;
- goto out_unlock;
+ goto out;
}
parent = dentry->d_parent;
@@ -940,8 +948,6 @@ no_entry:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_unblock_sillyrename:
nfs_unblock_sillyrename(parent);
-out_unlock:
- unlock_kernel();
out:
return res;
}
@@ -999,9 +1005,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
}
/* Open the file on the server */
- lock_kernel();
res = nfs4_atomic_open(dir, dentry, nd);
- unlock_kernel();
if (IS_ERR(res)) {
error = PTR_ERR(res);
switch (error) {
@@ -1063,9 +1067,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
* operations that change the directory. We therefore save the
* change attribute *before* we do the RPC call.
*/
- lock_kernel();
ret = nfs4_open_revalidate(dir, dentry, openflags, nd);
- unlock_kernel();
out:
dput(parent);
if (!ret)
@@ -1218,14 +1220,11 @@ static int nfs_create(struct inode *dir, struct dentry *dentry, int mode,
if ((nd->flags & LOOKUP_CREATE) != 0)
open_flags = nd->intent.open.flags;
- lock_kernel();
error = NFS_PROTO(dir)->create(dir, dentry, &attr, open_flags, nd);
if (error != 0)
goto out_err;
- unlock_kernel();
return 0;
out_err:
- unlock_kernel();
d_drop(dentry);
return error;
}
@@ -1248,14 +1247,11 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
attr.ia_mode = mode;
attr.ia_valid = ATTR_MODE;
- lock_kernel();
status = NFS_PROTO(dir)->mknod(dir, dentry, &attr, rdev);
if (status != 0)
goto out_err;
- unlock_kernel();
return 0;
out_err:
- unlock_kernel();
d_drop(dentry);
return status;
}
@@ -1274,15 +1270,12 @@ static int nfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
attr.ia_valid = ATTR_MODE;
attr.ia_mode = mode | S_IFDIR;
- lock_kernel();
error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
if (error != 0)
goto out_err;
- unlock_kernel();
return 0;
out_err:
d_drop(dentry);
- unlock_kernel();
return error;
}
@@ -1299,14 +1292,12 @@ static int nfs_rmdir(struct inode *dir, struct dentry *dentry)
dfprintk(VFS, "NFS: rmdir(%s/%ld), %s\n",
dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
- lock_kernel();
error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name);
/* Ensure the VFS deletes this inode */
if (error == 0 && dentry->d_inode != NULL)
clear_nlink(dentry->d_inode);
else if (error == -ENOENT)
nfs_dentry_handle_enoent(dentry);
- unlock_kernel();
return error;
}
@@ -1408,7 +1399,7 @@ static int nfs_safe_remove(struct dentry *dentry)
error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
/* The VFS may want to delete this inode */
if (error == 0)
- drop_nlink(inode);
+ nfs_drop_nlink(inode);
nfs_mark_for_revalidate(inode);
} else
error = NFS_PROTO(dir)->remove(dir, &dentry->d_name);
@@ -1431,7 +1422,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
dfprintk(VFS, "NFS: unlink(%s/%ld, %s)\n", dir->i_sb->s_id,
dir->i_ino, dentry->d_name.name);
- lock_kernel();
spin_lock(&dcache_lock);
spin_lock(&dentry->d_lock);
if (atomic_read(&dentry->d_count) > 1) {
@@ -1440,7 +1430,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
/* Start asynchronous writeout of the inode */
write_inode_now(dentry->d_inode, 0);
error = nfs_sillyrename(dir, dentry);
- unlock_kernel();
return error;
}
if (!d_unhashed(dentry)) {
@@ -1454,7 +1443,6 @@ static int nfs_unlink(struct inode *dir, struct dentry *dentry)
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
} else if (need_rehash)
d_rehash(dentry);
- unlock_kernel();
return error;
}
@@ -1491,13 +1479,9 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
attr.ia_mode = S_IFLNK | S_IRWXUGO;
attr.ia_valid = ATTR_MODE;
- lock_kernel();
-
page = alloc_page(GFP_HIGHUSER);
- if (!page) {
- unlock_kernel();
+ if (!page)
return -ENOMEM;
- }
kaddr = kmap_atomic(page, KM_USER0);
memcpy(kaddr, symname, pathlen);
@@ -1512,7 +1496,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
dentry->d_name.name, symname, error);
d_drop(dentry);
__free_page(page);
- unlock_kernel();
return error;
}
@@ -1530,7 +1513,6 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym
} else
__free_page(page);
- unlock_kernel();
return 0;
}
@@ -1544,14 +1526,12 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
old_dentry->d_parent->d_name.name, old_dentry->d_name.name,
dentry->d_parent->d_name.name, dentry->d_name.name);
- lock_kernel();
d_drop(dentry);
error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name);
if (error == 0) {
atomic_inc(&inode->i_count);
d_add(dentry, inode);
}
- unlock_kernel();
return error;
}
@@ -1591,7 +1571,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* To prevent any new references to the target during the rename,
* we unhash the dentry and free the inode in advance.
*/
- lock_kernel();
if (!d_unhashed(new_dentry)) {
d_drop(new_dentry);
rehash = new_dentry;
@@ -1635,7 +1614,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* dentry still busy? */
goto out;
} else
- drop_nlink(new_inode);
+ nfs_drop_nlink(new_inode);
go_ahead:
/*
@@ -1669,7 +1648,6 @@ out:
/* new dentry created? */
if (dentry)
dput(dentry);
- unlock_kernel();
return error;
}
@@ -1906,7 +1884,7 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
return status;
nfs_access_add_cache(inode, &cache);
out:
- if ((cache.mask & mask) == mask)
+ if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
return 0;
return -EACCES;
}
@@ -1929,17 +1907,17 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
return nfs_do_access(inode, cred, nfs_open_permission_mask(openflags));
}
-int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+int nfs_permission(struct inode *inode, int mask)
{
struct rpc_cred *cred;
int res = 0;
nfs_inc_stats(inode, NFSIOS_VFSACCESS);
- if (mask == 0)
+ if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
goto out;
/* Is this sys_access() ? */
- if (nd != NULL && (nd->flags & LOOKUP_ACCESS))
+ if (mask & MAY_ACCESS)
goto force_lookup;
switch (inode->i_mode & S_IFMT) {
@@ -1948,8 +1926,7 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
case S_IFREG:
/* NFSv4 has atomic_open... */
if (nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)
- && nd != NULL
- && (nd->flags & LOOKUP_OPEN))
+ && (mask & MAY_OPEN))
goto out;
break;
case S_IFDIR:
@@ -1962,8 +1939,6 @@ int nfs_permission(struct inode *inode, int mask, struct nameidata *nd)
}
force_lookup:
- lock_kernel();
-
if (!NFS_PROTO(inode)->access)
goto out_notsup;
@@ -1973,7 +1948,6 @@ force_lookup:
put_rpccred(cred);
} else
res = PTR_ERR(cred);
- unlock_kernel();
out:
dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
inode->i_sb->s_id, inode->i_ino, mask, res);
@@ -1982,7 +1956,6 @@ out_notsup:
res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
if (res == 0)
res = generic_permission(inode, mask, NULL);
- unlock_kernel();
goto out;
}
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4757a2b326a..08f6b040d28 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -890,7 +890,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
count = iov_length(iov, nr_segs);
nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
- dprintk("nfs: direct read(%s/%s, %zd@%Ld)\n",
+ dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n",
file->f_path.dentry->d_parent->d_name.name,
file->f_path.dentry->d_name.name,
count, (long long) pos);
@@ -947,7 +947,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
count = iov_length(iov, nr_segs);
nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
- dfprintk(VFS, "nfs: direct write(%s/%s, %zd@%Ld)\n",
+ dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n",
file->f_path.dentry->d_parent->d_name.name,
file->f_path.dentry->d_name.name,
count, (long long) pos);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d84a3d8f32a..78460657f5c 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -50,7 +50,7 @@ static ssize_t nfs_file_read(struct kiocb *, const struct iovec *iov,
static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
static int nfs_file_flush(struct file *, fl_owner_t id);
-static int nfs_fsync(struct file *, struct dentry *dentry, int datasync);
+static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync);
static int nfs_check_flags(int flags);
static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -72,7 +72,7 @@ const struct file_operations nfs_file_operations = {
.open = nfs_file_open,
.flush = nfs_file_flush,
.release = nfs_file_release,
- .fsync = nfs_fsync,
+ .fsync = nfs_file_fsync,
.lock = nfs_lock,
.flock = nfs_flock,
.splice_read = nfs_file_splice_read,
@@ -119,25 +119,33 @@ nfs_file_open(struct inode *inode, struct file *filp)
{
int res;
+ dprintk("NFS: open file(%s/%s)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name);
+
res = nfs_check_flags(filp->f_flags);
if (res)
return res;
nfs_inc_stats(inode, NFSIOS_VFSOPEN);
- lock_kernel();
- res = NFS_PROTO(inode)->file_open(inode, filp);
- unlock_kernel();
+ res = nfs_open(inode, filp);
return res;
}
static int
nfs_file_release(struct inode *inode, struct file *filp)
{
+ struct dentry *dentry = filp->f_path.dentry;
+
+ dprintk("NFS: release(%s/%s)\n",
+ dentry->d_parent->d_name.name,
+ dentry->d_name.name);
+
/* Ensure that dirty pages are flushed out with the right creds */
if (filp->f_mode & FMODE_WRITE)
- nfs_wb_all(filp->f_path.dentry->d_inode);
+ nfs_wb_all(dentry->d_inode);
nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
- return NFS_PROTO(inode)->file_release(inode, filp);
+ return nfs_release(inode, filp);
}
/**
@@ -170,6 +178,13 @@ force_reval:
static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
{
+ loff_t loff;
+
+ dprintk("NFS: llseek file(%s/%s, %lld, %d)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name,
+ offset, origin);
+
/* origin == SEEK_END => we must revalidate the cached file length */
if (origin == SEEK_END) {
struct inode *inode = filp->f_mapping->host;
@@ -177,11 +192,14 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
if (retval < 0)
return (loff_t)retval;
}
- return remote_llseek(filp, offset, origin);
+ lock_kernel(); /* BKL needed? */
+ loff = generic_file_llseek_unlocked(filp, offset, origin);
+ unlock_kernel();
+ return loff;
}
/*
- * Helper for nfs_file_flush() and nfs_fsync()
+ * Helper for nfs_file_flush() and nfs_file_fsync()
*
* Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to
* disk, but it retrieves and clears ctx->error after synching, despite
@@ -207,16 +225,18 @@ static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode)
/*
* Flush all dirty pages, and check for write errors.
- *
*/
static int
nfs_file_flush(struct file *file, fl_owner_t id)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
- struct inode *inode = file->f_path.dentry->d_inode;
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
int status;
- dfprintk(VFS, "nfs: flush(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
+ dprintk("NFS: flush(%s/%s)\n",
+ dentry->d_parent->d_name.name,
+ dentry->d_name.name);
if ((file->f_mode & FMODE_WRITE) == 0)
return 0;
@@ -241,7 +261,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
if (iocb->ki_filp->f_flags & O_DIRECT)
return nfs_file_direct_read(iocb, iov, nr_segs, pos);
- dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n",
+ dprintk("NFS: read(%s/%s, %lu@%lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
(unsigned long) count, (unsigned long) pos);
@@ -261,7 +281,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
struct inode *inode = dentry->d_inode;
ssize_t res;
- dfprintk(VFS, "nfs: splice_read(%s/%s, %lu@%Lu)\n",
+ dprintk("NFS: splice_read(%s/%s, %lu@%Lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
(unsigned long) count, (unsigned long long) *ppos);
@@ -278,7 +298,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
struct inode *inode = dentry->d_inode;
int status;
- dfprintk(VFS, "nfs: mmap(%s/%s)\n",
+ dprintk("NFS: mmap(%s/%s)\n",
dentry->d_parent->d_name.name, dentry->d_name.name);
status = nfs_revalidate_mapping(inode, file->f_mapping);
@@ -296,12 +316,14 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
* whether any write errors occurred for this process.
*/
static int
-nfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct inode *inode = dentry->d_inode;
- dfprintk(VFS, "nfs: fsync(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino);
+ dprintk("NFS: fsync file(%s/%s) datasync %d\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name,
+ datasync);
nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
return nfs_do_fsync(ctx, inode);
@@ -324,6 +346,11 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
struct page *page;
index = pos >> PAGE_CACHE_SHIFT;
+ dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name,
+ mapping->host->i_ino, len, (long long) pos);
+
page = __grab_cache_page(mapping, index);
if (!page)
return -ENOMEM;
@@ -344,9 +371,32 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
int status;
- lock_kernel();
+ dfprintk(PAGECACHE, "NFS: write_end(%s/%s(%ld), %u@%lld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name,
+ mapping->host->i_ino, len, (long long) pos);
+
+ /*
+ * Zero any uninitialised parts of the page, and then mark the page
+ * as up to date if it turns out that we're extending the file.
+ */
+ if (!PageUptodate(page)) {
+ unsigned pglen = nfs_page_length(page);
+ unsigned end = offset + len;
+
+ if (pglen == 0) {
+ zero_user_segments(page, 0, offset,
+ end, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ } else if (end >= pglen) {
+ zero_user_segment(page, end, PAGE_CACHE_SIZE);
+ if (offset == 0)
+ SetPageUptodate(page);
+ } else
+ zero_user_segment(page, pglen, PAGE_CACHE_SIZE);
+ }
+
status = nfs_updatepage(file, page, offset, copied);
- unlock_kernel();
unlock_page(page);
page_cache_release(page);
@@ -358,6 +408,8 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
static void nfs_invalidate_page(struct page *page, unsigned long offset)
{
+ dfprintk(PAGECACHE, "NFS: invalidate_page(%p, %lu)\n", page, offset);
+
if (offset != 0)
return;
/* Cancel any unstarted writes on this page */
@@ -366,13 +418,20 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset)
static int nfs_release_page(struct page *page, gfp_t gfp)
{
+ dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
+
/* If PagePrivate() is set, then the page is not freeable */
return 0;
}
static int nfs_launder_page(struct page *page)
{
- return nfs_wb_page(page->mapping->host, page);
+ struct inode *inode = page->mapping->host;
+
+ dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n",
+ inode->i_ino, (long long)page_offset(page));
+
+ return nfs_wb_page(inode, page);
}
const struct address_space_operations nfs_file_aops = {
@@ -392,13 +451,19 @@ const struct address_space_operations nfs_file_aops = {
static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
struct file *filp = vma->vm_file;
+ struct dentry *dentry = filp->f_path.dentry;
unsigned pagelen;
int ret = -EINVAL;
struct address_space *mapping;
+ dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%s/%s(%ld), offset %lld)\n",
+ dentry->d_parent->d_name.name, dentry->d_name.name,
+ filp->f_mapping->host->i_ino,
+ (long long)page_offset(page));
+
lock_page(page);
mapping = page->mapping;
- if (mapping != vma->vm_file->f_path.dentry->d_inode->i_mapping)
+ if (mapping != dentry->d_inode->i_mapping)
goto out_unlock;
ret = 0;
@@ -446,9 +511,9 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
if (iocb->ki_filp->f_flags & O_DIRECT)
return nfs_file_direct_write(iocb, iov, nr_segs, pos);
- dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n",
+ dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- inode->i_ino, (unsigned long) count, (long long) pos);
+ (unsigned long) count, (long long) pos);
result = -EBUSY;
if (IS_SWAPFILE(inode))
@@ -582,7 +647,8 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl)
* This makes locking act as a cache coherency point.
*/
nfs_sync_mapping(filp->f_mapping);
- nfs_zap_caches(inode);
+ if (!nfs_have_delegation(inode, FMODE_READ))
+ nfs_zap_caches(inode);
out:
return status;
}
@@ -592,23 +658,35 @@ out:
*/
static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
{
- struct inode * inode = filp->f_mapping->host;
+ struct inode *inode = filp->f_mapping->host;
+ int ret = -ENOLCK;
- dprintk("NFS: nfs_lock(f=%s/%ld, t=%x, fl=%x, r=%Ld:%Ld)\n",
- inode->i_sb->s_id, inode->i_ino,
+ dprintk("NFS: lock(%s/%s, t=%x, fl=%x, r=%lld:%lld)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name,
fl->fl_type, fl->fl_flags,
(long long)fl->fl_start, (long long)fl->fl_end);
+
nfs_inc_stats(inode, NFSIOS_VFSLOCK);
/* No mandatory locks over NFS */
if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
- return -ENOLCK;
+ goto out_err;
+
+ if (NFS_PROTO(inode)->lock_check_bounds != NULL) {
+ ret = NFS_PROTO(inode)->lock_check_bounds(fl);
+ if (ret < 0)
+ goto out_err;
+ }
if (IS_GETLK(cmd))
- return do_getlk(filp, cmd, fl);
- if (fl->fl_type == F_UNLCK)
- return do_unlk(filp, cmd, fl);
- return do_setlk(filp, cmd, fl);
+ ret = do_getlk(filp, cmd, fl);
+ else if (fl->fl_type == F_UNLCK)
+ ret = do_unlk(filp, cmd, fl);
+ else
+ ret = do_setlk(filp, cmd, fl);
+out_err:
+ return ret;
}
/*
@@ -616,9 +694,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
*/
static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
{
- dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n",
- filp->f_path.dentry->d_inode->i_sb->s_id,
- filp->f_path.dentry->d_inode->i_ino,
+ dprintk("NFS: flock(%s/%s, t=%x, fl=%x)\n",
+ filp->f_path.dentry->d_parent->d_name.name,
+ filp->f_path.dentry->d_name.name,
fl->fl_type, fl->fl_flags);
/*
@@ -641,12 +719,15 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
return do_setlk(filp, cmd, fl);
}
+/*
+ * There is no protocol support for leases, so we have no way to implement
+ * them correctly in the face of opens by other clients.
+ */
static int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
{
- /*
- * There is no protocol support for leases, so we have no way
- * to implement them correctly in the face of opens by other
- * clients.
- */
+ dprintk("NFS: setlease(%s/%s, arg=%ld)\n",
+ file->f_path.dentry->d_parent->d_name.name,
+ file->f_path.dentry->d_name.name, arg);
+
return -EINVAL;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 596c5d8e86f..52daefa2f52 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -57,8 +57,6 @@ static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
static void nfs_invalidate_inode(struct inode *);
static int nfs_update_inode(struct inode *, struct nfs_fattr *);
-static void nfs_zap_acl_cache(struct inode *);
-
static struct kmem_cache * nfs_inode_cachep;
static inline unsigned long
@@ -167,7 +165,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
}
}
-static void nfs_zap_acl_cache(struct inode *inode)
+void nfs_zap_acl_cache(struct inode *inode)
{
void (*clear_acl_cache)(struct inode *);
@@ -347,7 +345,7 @@ out_no_inode:
goto out;
}
-#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET)
+#define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE)
int
nfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -369,10 +367,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
/* Optimization: if the end result is no change, don't RPC */
attr->ia_valid &= NFS_VALID_ATTRS;
- if (attr->ia_valid == 0)
+ if ((attr->ia_valid & ~ATTR_FILE) == 0)
return 0;
- lock_kernel();
/* Write all dirty data */
if (S_ISREG(inode->i_mode)) {
filemap_write_and_wait(inode->i_mapping);
@@ -386,11 +383,66 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
if (error == 0)
nfs_refresh_inode(inode, &fattr);
- unlock_kernel();
return error;
}
/**
+ * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall
+ * @inode: inode of the file used
+ * @offset: file offset to start truncating
+ *
+ * This is a copy of the common vmtruncate, but with the locking
+ * corrected to take into account the fact that NFS requires
+ * inode->i_size to be updated under the inode->i_lock.
+ */
+static int nfs_vmtruncate(struct inode * inode, loff_t offset)
+{
+ if (i_size_read(inode) < offset) {
+ unsigned long limit;
+
+ limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ if (limit != RLIM_INFINITY && offset > limit)
+ goto out_sig;
+ if (offset > inode->i_sb->s_maxbytes)
+ goto out_big;
+ spin_lock(&inode->i_lock);
+ i_size_write(inode, offset);
+ spin_unlock(&inode->i_lock);
+ } else {
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * truncation of in-use swapfiles is disallowed - it would
+ * cause subsequent swapout to scribble on the now-freed
+ * blocks.
+ */
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+ spin_lock(&inode->i_lock);
+ i_size_write(inode, offset);
+ spin_unlock(&inode->i_lock);
+
+ /*
+ * unmap_mapping_range is called twice, first simply for
+ * efficiency so that truncate_inode_pages does fewer
+ * single-page unmaps. However after this first call, and
+ * before truncate_inode_pages finishes, it is possible for
+ * private pages to be COWed, which remain after
+ * truncate_inode_pages finishes, hence the second
+ * unmap_mapping_range call must be made for correctness.
+ */
+ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(mapping, offset);
+ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+ }
+ return 0;
+out_sig:
+ send_sig(SIGXFSZ, current, 0);
+out_big:
+ return -EFBIG;
+}
+
+/**
* nfs_setattr_update_inode - Update inode metadata after a setattr call.
* @inode: pointer to struct inode
* @attr: pointer to struct iattr
@@ -416,8 +468,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
}
if ((attr->ia_valid & ATTR_SIZE) != 0) {
nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
- inode->i_size = attr->ia_size;
- vmtruncate(inode, attr->ia_size);
+ nfs_vmtruncate(inode, attr->ia_size);
}
}
@@ -647,7 +698,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
inode->i_sb->s_id, (long long)NFS_FILEID(inode));
nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
- lock_kernel();
if (is_bad_inode(inode))
goto out_nowait;
if (NFS_STALE(inode))
@@ -696,7 +746,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
nfs_wake_up_inode(inode);
out_nowait:
- unlock_kernel();
return status;
}
@@ -831,9 +880,9 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (S_ISDIR(inode->i_mode))
nfsi->cache_validity |= NFS_INO_INVALID_DATA;
}
- if (inode->i_size == nfs_size_to_loff_t(fattr->pre_size) &&
+ if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
nfsi->npages == 0)
- inode->i_size = nfs_size_to_loff_t(fattr->size);
+ i_size_write(inode, nfs_size_to_loff_t(fattr->size));
}
}
@@ -974,7 +1023,7 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
(fattr->valid & NFS_ATTR_WCC) == 0) {
memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
- fattr->pre_size = inode->i_size;
+ fattr->pre_size = i_size_read(inode);
fattr->valid |= NFS_ATTR_WCC;
}
return nfs_post_op_update_inode(inode, fattr);
@@ -1059,7 +1108,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Do we perhaps have any outstanding writes, or has
* the file grown beyond our last write? */
if (nfsi->npages == 0 || new_isize > cur_isize) {
- inode->i_size = new_isize;
+ i_size_write(inode, new_isize);
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
}
dprintk("NFS: isize change on server for file %s/%ld\n",
@@ -1193,7 +1242,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
#endif
}
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
{
struct nfs_inode *nfsi = (struct nfs_inode *) foo;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 04ae867dddb..24241fcbb98 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -150,6 +150,7 @@ extern void nfs_clear_inode(struct inode *);
#ifdef CONFIG_NFS_V4
extern void nfs4_clear_inode(struct inode *);
#endif
+void nfs_zap_acl_cache(struct inode *inode);
/* super.c */
extern struct file_system_type nfs_xdev_fs_type;
diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h
index 6350ecbde58..a3695281003 100644
--- a/fs/nfs/iostat.h
+++ b/fs/nfs/iostat.h
@@ -5,135 +5,41 @@
*
* Copyright (C) 2005, 2006 Chuck Lever <cel@netapp.com>
*
- * NFS client per-mount statistics provide information about the health of
- * the NFS client and the health of each NFS mount point. Generally these
- * are not for detailed problem diagnosis, but simply to indicate that there
- * is a problem.
- *
- * These counters are not meant to be human-readable, but are meant to be
- * integrated into system monitoring tools such as "sar" and "iostat". As
- * such, the counters are sampled by the tools over time, and are never
- * zeroed after a file system is mounted. Moving averages can be computed
- * by the tools by taking the difference between two instantaneous samples
- * and dividing that by the time between the samples.
*/
#ifndef _NFS_IOSTAT
#define _NFS_IOSTAT
-#define NFS_IOSTAT_VERS "1.0"
-
-/*
- * NFS byte counters
- *
- * 1. SERVER - the number of payload bytes read from or written to the
- * server by the NFS client via an NFS READ or WRITE request.
- *
- * 2. NORMAL - the number of bytes read or written by applications via
- * the read(2) and write(2) system call interfaces.
- *
- * 3. DIRECT - the number of bytes read or written from files opened
- * with the O_DIRECT flag.
- *
- * These counters give a view of the data throughput into and out of the NFS
- * client. Comparing the number of bytes requested by an application with the
- * number of bytes the client requests from the server can provide an
- * indication of client efficiency (per-op, cache hits, etc).
- *
- * These counters can also help characterize which access methods are in
- * use. DIRECT by itself shows whether there is any O_DIRECT traffic.
- * NORMAL + DIRECT shows how much data is going through the system call
- * interface. A large amount of SERVER traffic without much NORMAL or
- * DIRECT traffic shows that applications are using mapped files.
- *
- * NFS page counters
- *
- * These count the number of pages read or written via nfs_readpage(),
- * nfs_readpages(), or their write equivalents.
- */
-enum nfs_stat_bytecounters {
- NFSIOS_NORMALREADBYTES = 0,
- NFSIOS_NORMALWRITTENBYTES,
- NFSIOS_DIRECTREADBYTES,
- NFSIOS_DIRECTWRITTENBYTES,
- NFSIOS_SERVERREADBYTES,
- NFSIOS_SERVERWRITTENBYTES,
- NFSIOS_READPAGES,
- NFSIOS_WRITEPAGES,
- __NFSIOS_BYTESMAX,
-};
-
-/*
- * NFS event counters
- *
- * These counters provide a low-overhead way of monitoring client activity
- * without enabling NFS trace debugging. The counters show the rate at
- * which VFS requests are made, and how often the client invalidates its
- * data and attribute caches. This allows system administrators to monitor
- * such things as how close-to-open is working, and answer questions such
- * as "why are there so many GETATTR requests on the wire?"
- *
- * They also count anamolous events such as short reads and writes, silly
- * renames due to close-after-delete, and operations that change the size
- * of a file (such operations can often be the source of data corruption
- * if applications aren't using file locking properly).
- */
-enum nfs_stat_eventcounters {
- NFSIOS_INODEREVALIDATE = 0,
- NFSIOS_DENTRYREVALIDATE,
- NFSIOS_DATAINVALIDATE,
- NFSIOS_ATTRINVALIDATE,
- NFSIOS_VFSOPEN,
- NFSIOS_VFSLOOKUP,
- NFSIOS_VFSACCESS,
- NFSIOS_VFSUPDATEPAGE,
- NFSIOS_VFSREADPAGE,
- NFSIOS_VFSREADPAGES,
- NFSIOS_VFSWRITEPAGE,
- NFSIOS_VFSWRITEPAGES,
- NFSIOS_VFSGETDENTS,
- NFSIOS_VFSSETATTR,
- NFSIOS_VFSFLUSH,
- NFSIOS_VFSFSYNC,
- NFSIOS_VFSLOCK,
- NFSIOS_VFSRELEASE,
- NFSIOS_CONGESTIONWAIT,
- NFSIOS_SETATTRTRUNC,
- NFSIOS_EXTENDWRITE,
- NFSIOS_SILLYRENAME,
- NFSIOS_SHORTREAD,
- NFSIOS_SHORTWRITE,
- NFSIOS_DELAY,
- __NFSIOS_COUNTSMAX,
-};
-
-#ifdef __KERNEL__
-
#include <linux/percpu.h>
#include <linux/cache.h>
+#include <linux/nfs_iostat.h>
struct nfs_iostats {
unsigned long long bytes[__NFSIOS_BYTESMAX];
unsigned long events[__NFSIOS_COUNTSMAX];
} ____cacheline_aligned;
-static inline void nfs_inc_server_stats(struct nfs_server *server, enum nfs_stat_eventcounters stat)
+static inline void nfs_inc_server_stats(const struct nfs_server *server,
+ enum nfs_stat_eventcounters stat)
{
struct nfs_iostats *iostats;
int cpu;
cpu = get_cpu();
iostats = per_cpu_ptr(server->io_stats, cpu);
- iostats->events[stat] ++;
+ iostats->events[stat]++;
put_cpu_no_resched();
}
-static inline void nfs_inc_stats(struct inode *inode, enum nfs_stat_eventcounters stat)
+static inline void nfs_inc_stats(const struct inode *inode,
+ enum nfs_stat_eventcounters stat)
{
nfs_inc_server_stats(NFS_SERVER(inode), stat);
}
-static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat_bytecounters stat, unsigned long addend)
+static inline void nfs_add_server_stats(const struct nfs_server *server,
+ enum nfs_stat_bytecounters stat,
+ unsigned long addend)
{
struct nfs_iostats *iostats;
int cpu;
@@ -144,7 +50,9 @@ static inline void nfs_add_server_stats(struct nfs_server *server, enum nfs_stat
put_cpu_no_resched();
}
-static inline void nfs_add_stats(struct inode *inode, enum nfs_stat_bytecounters stat, unsigned long addend)
+static inline void nfs_add_stats(const struct inode *inode,
+ enum nfs_stat_bytecounters stat,
+ unsigned long addend)
{
nfs_add_server_stats(NFS_SERVER(inode), stat, addend);
}
@@ -160,5 +68,4 @@ static inline void nfs_free_iostats(struct nfs_iostats *stats)
free_percpu(stats);
}
-#endif
-#endif
+#endif /* _NFS_IOSTAT */
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 2f285ef7639..66df08dd1ca 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -129,7 +129,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
goto out_err;
mntget(mnt);
- err = do_add_mount(mnt, nd, nd->path.mnt->mnt_flags|MNT_SHRINKABLE,
+ err = do_add_mount(mnt, &nd->path, nd->path.mnt->mnt_flags|MNT_SHRINKABLE,
&nfs_automount_list);
if (err < 0) {
mntput(mnt);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 9b7362565c0..423842f51ac 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -5,6 +5,8 @@
#include <linux/posix_acl_xattr.h>
#include <linux/nfsacl.h>
+#include "internal.h"
+
#define NFSDBG_FACILITY NFSDBG_PROC
ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size)
@@ -205,6 +207,8 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
status = nfs_revalidate_inode(server, inode);
if (status < 0)
return ERR_PTR(status);
+ if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+ nfs_zap_acl_cache(inode);
acl = nfs3_get_cached_acl(inode, type);
if (acl != ERR_PTR(-EAGAIN))
return acl;
@@ -319,9 +323,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
dprintk("NFS call setacl\n");
msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL];
status = rpc_call_sync(server->client_acl, &msg, 0);
- spin_lock(&inode->i_lock);
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS;
- spin_unlock(&inode->i_lock);
+ nfs_access_zap_cache(inode);
+ nfs_zap_acl_cache(inode);
dprintk("NFS reply setacl: %d\n", status);
/* pages may have been allocated at the xdr layer. */
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c3523ad03ed..1e750e4574a 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -129,6 +129,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
int status;
dprintk("NFS call setattr\n");
+ if (sattr->ia_valid & ATTR_FILE)
+ msg.rpc_cred = nfs_file_cred(sattr->ia_file);
nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
if (status == 0)
@@ -248,6 +250,53 @@ static int nfs3_proc_readlink(struct inode *inode, struct page *page,
return status;
}
+struct nfs3_createdata {
+ struct rpc_message msg;
+ union {
+ struct nfs3_createargs create;
+ struct nfs3_mkdirargs mkdir;
+ struct nfs3_symlinkargs symlink;
+ struct nfs3_mknodargs mknod;
+ } arg;
+ struct nfs3_diropres res;
+ struct nfs_fh fh;
+ struct nfs_fattr fattr;
+ struct nfs_fattr dir_attr;
+};
+
+static struct nfs3_createdata *nfs3_alloc_createdata(void)
+{
+ struct nfs3_createdata *data;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data != NULL) {
+ data->msg.rpc_argp = &data->arg;
+ data->msg.rpc_resp = &data->res;
+ data->res.fh = &data->fh;
+ data->res.fattr = &data->fattr;
+ data->res.dir_attr = &data->dir_attr;
+ nfs_fattr_init(data->res.fattr);
+ nfs_fattr_init(data->res.dir_attr);
+ }
+ return data;
+}
+
+static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data)
+{
+ int status;
+
+ status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
+ nfs_post_op_update_inode(dir, data->res.dir_attr);
+ if (status == 0)
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ return status;
+}
+
+static void nfs3_free_createdata(struct nfs3_createdata *data)
+{
+ kfree(data);
+}
+
/*
* Create a regular file.
* For now, we don't implement O_EXCL.
@@ -256,70 +305,60 @@ static int
nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
int flags, struct nameidata *nd)
{
- struct nfs_fh fhandle;
- struct nfs_fattr fattr;
- struct nfs_fattr dir_attr;
- struct nfs3_createargs arg = {
- .fh = NFS_FH(dir),
- .name = dentry->d_name.name,
- .len = dentry->d_name.len,
- .sattr = sattr,
- };
- struct nfs3_diropres res = {
- .dir_attr = &dir_attr,
- .fh = &fhandle,
- .fattr = &fattr
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs3_procedures[NFS3PROC_CREATE],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
+ struct nfs3_createdata *data;
mode_t mode = sattr->ia_mode;
- int status;
+ int status = -ENOMEM;
dprintk("NFS call create %s\n", dentry->d_name.name);
- arg.createmode = NFS3_CREATE_UNCHECKED;
+
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
+ goto out;
+
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_CREATE];
+ data->arg.create.fh = NFS_FH(dir);
+ data->arg.create.name = dentry->d_name.name;
+ data->arg.create.len = dentry->d_name.len;
+ data->arg.create.sattr = sattr;
+
+ data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
if (flags & O_EXCL) {
- arg.createmode = NFS3_CREATE_EXCLUSIVE;
- arg.verifier[0] = jiffies;
- arg.verifier[1] = current->pid;
+ data->arg.create.createmode = NFS3_CREATE_EXCLUSIVE;
+ data->arg.create.verifier[0] = jiffies;
+ data->arg.create.verifier[1] = current->pid;
}
sattr->ia_mode &= ~current->fs->umask;
-again:
- nfs_fattr_init(&dir_attr);
- nfs_fattr_init(&fattr);
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_refresh_inode(dir, &dir_attr);
+ for (;;) {
+ status = nfs3_do_create(dir, dentry, data);
- /* If the server doesn't support the exclusive creation semantics,
- * try again with simple 'guarded' mode. */
- if (status == -ENOTSUPP) {
- switch (arg.createmode) {
+ if (status != -ENOTSUPP)
+ break;
+ /* If the server doesn't support the exclusive creation
+ * semantics, try again with simple 'guarded' mode. */
+ switch (data->arg.create.createmode) {
case NFS3_CREATE_EXCLUSIVE:
- arg.createmode = NFS3_CREATE_GUARDED;
+ data->arg.create.createmode = NFS3_CREATE_GUARDED;
break;
case NFS3_CREATE_GUARDED:
- arg.createmode = NFS3_CREATE_UNCHECKED;
+ data->arg.create.createmode = NFS3_CREATE_UNCHECKED;
break;
case NFS3_CREATE_UNCHECKED:
goto out;
}
- goto again;
+ nfs_fattr_init(data->res.dir_attr);
+ nfs_fattr_init(data->res.fattr);
}
- if (status == 0)
- status = nfs_instantiate(dentry, &fhandle, &fattr);
if (status != 0)
goto out;
/* When we created the file with exclusive semantics, make
* sure we set the attributes afterwards. */
- if (arg.createmode == NFS3_CREATE_EXCLUSIVE) {
+ if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) {
dprintk("NFS call setattr (post-create)\n");
if (!(sattr->ia_valid & ATTR_ATIME_SET))
@@ -330,14 +369,15 @@ again:
/* Note: we could use a guarded setattr here, but I'm
* not sure this buys us anything (and I'd have
* to revamp the NFSv3 XDR code) */
- status = nfs3_proc_setattr(dentry, &fattr, sattr);
- nfs_post_op_update_inode(dentry->d_inode, &fattr);
+ status = nfs3_proc_setattr(dentry, data->res.fattr, sattr);
+ nfs_post_op_update_inode(dentry->d_inode, data->res.fattr);
dprintk("NFS reply setattr (post-create): %d\n", status);
+ if (status != 0)
+ goto out;
}
- if (status != 0)
- goto out;
status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
out:
+ nfs3_free_createdata(data);
dprintk("NFS reply create: %d\n", status);
return status;
}
@@ -452,40 +492,28 @@ static int
nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page,
unsigned int len, struct iattr *sattr)
{
- struct nfs_fh fhandle;
- struct nfs_fattr fattr, dir_attr;
- struct nfs3_symlinkargs arg = {
- .fromfh = NFS_FH(dir),
- .fromname = dentry->d_name.name,
- .fromlen = dentry->d_name.len,
- .pages = &page,
- .pathlen = len,
- .sattr = sattr
- };
- struct nfs3_diropres res = {
- .dir_attr = &dir_attr,
- .fh = &fhandle,
- .fattr = &fattr
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status;
+ struct nfs3_createdata *data;
+ int status = -ENOMEM;
if (len > NFS3_MAXPATHLEN)
return -ENAMETOOLONG;
dprintk("NFS call symlink %s\n", dentry->d_name.name);
- nfs_fattr_init(&dir_attr);
- nfs_fattr_init(&fattr);
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_post_op_update_inode(dir, &dir_attr);
- if (status != 0)
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
goto out;
- status = nfs_instantiate(dentry, &fhandle, &fattr);
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_SYMLINK];
+ data->arg.symlink.fromfh = NFS_FH(dir);
+ data->arg.symlink.fromname = dentry->d_name.name;
+ data->arg.symlink.fromlen = dentry->d_name.len;
+ data->arg.symlink.pages = &page;
+ data->arg.symlink.pathlen = len;
+ data->arg.symlink.sattr = sattr;
+
+ status = nfs3_do_create(dir, dentry, data);
+
+ nfs3_free_createdata(data);
out:
dprintk("NFS reply symlink: %d\n", status);
return status;
@@ -494,42 +522,31 @@ out:
static int
nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
{
- struct nfs_fh fhandle;
- struct nfs_fattr fattr, dir_attr;
- struct nfs3_mkdirargs arg = {
- .fh = NFS_FH(dir),
- .name = dentry->d_name.name,
- .len = dentry->d_name.len,
- .sattr = sattr
- };
- struct nfs3_diropres res = {
- .dir_attr = &dir_attr,
- .fh = &fhandle,
- .fattr = &fattr
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
+ struct nfs3_createdata *data;
int mode = sattr->ia_mode;
- int status;
+ int status = -ENOMEM;
dprintk("NFS call mkdir %s\n", dentry->d_name.name);
sattr->ia_mode &= ~current->fs->umask;
- nfs_fattr_init(&dir_attr);
- nfs_fattr_init(&fattr);
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_post_op_update_inode(dir, &dir_attr);
- if (status != 0)
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
goto out;
- status = nfs_instantiate(dentry, &fhandle, &fattr);
+
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
+ data->arg.mkdir.fh = NFS_FH(dir);
+ data->arg.mkdir.name = dentry->d_name.name;
+ data->arg.mkdir.len = dentry->d_name.len;
+ data->arg.mkdir.sattr = sattr;
+
+ status = nfs3_do_create(dir, dentry, data);
if (status != 0)
goto out;
+
status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
out:
+ nfs3_free_createdata(data);
dprintk("NFS reply mkdir: %d\n", status);
return status;
}
@@ -615,52 +632,50 @@ static int
nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
dev_t rdev)
{
- struct nfs_fh fh;
- struct nfs_fattr fattr, dir_attr;
- struct nfs3_mknodargs arg = {
- .fh = NFS_FH(dir),
- .name = dentry->d_name.name,
- .len = dentry->d_name.len,
- .sattr = sattr,
- .rdev = rdev
- };
- struct nfs3_diropres res = {
- .dir_attr = &dir_attr,
- .fh = &fh,
- .fattr = &fattr
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
+ struct nfs3_createdata *data;
mode_t mode = sattr->ia_mode;
- int status;
-
- switch (sattr->ia_mode & S_IFMT) {
- case S_IFBLK: arg.type = NF3BLK; break;
- case S_IFCHR: arg.type = NF3CHR; break;
- case S_IFIFO: arg.type = NF3FIFO; break;
- case S_IFSOCK: arg.type = NF3SOCK; break;
- default: return -EINVAL;
- }
+ int status = -ENOMEM;
dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name,
MAJOR(rdev), MINOR(rdev));
sattr->ia_mode &= ~current->fs->umask;
- nfs_fattr_init(&dir_attr);
- nfs_fattr_init(&fattr);
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- nfs_post_op_update_inode(dir, &dir_attr);
- if (status != 0)
+ data = nfs3_alloc_createdata();
+ if (data == NULL)
goto out;
- status = nfs_instantiate(dentry, &fh, &fattr);
+
+ data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD];
+ data->arg.mknod.fh = NFS_FH(dir);
+ data->arg.mknod.name = dentry->d_name.name;
+ data->arg.mknod.len = dentry->d_name.len;
+ data->arg.mknod.sattr = sattr;
+ data->arg.mknod.rdev = rdev;
+
+ switch (sattr->ia_mode & S_IFMT) {
+ case S_IFBLK:
+ data->arg.mknod.type = NF3BLK;
+ break;
+ case S_IFCHR:
+ data->arg.mknod.type = NF3CHR;
+ break;
+ case S_IFIFO:
+ data->arg.mknod.type = NF3FIFO;
+ break;
+ case S_IFSOCK:
+ data->arg.mknod.type = NF3SOCK;
+ break;
+ default:
+ status = -EINVAL;
+ goto out;
+ }
+
+ status = nfs3_do_create(dir, dentry, data);
if (status != 0)
goto out;
status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
out:
+ nfs3_free_createdata(data);
dprintk("NFS reply mknod: %d\n", status);
return status;
}
@@ -801,8 +816,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.write_done = nfs3_write_done,
.commit_setup = nfs3_proc_commit_setup,
.commit_done = nfs3_commit_done,
- .file_open = nfs_open,
- .file_release = nfs_release,
.lock = nfs3_proc_lock,
.clear_acl_cache = nfs3_forget_cached_acls,
};
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1293e0acd82..c910413eaec 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -451,9 +451,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
/* Save the delegation */
memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
rcu_read_unlock();
- lock_kernel();
ret = nfs_may_open(state->inode, state->owner->so_cred, open_mode);
- unlock_kernel();
if (ret != 0)
goto out;
ret = -EAGAIN;
@@ -1139,8 +1137,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int
return res;
}
-static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
- struct iattr *sattr, struct nfs4_state *state)
+static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+ struct nfs_fattr *fattr, struct iattr *sattr,
+ struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs_setattrargs arg = {
@@ -1154,9 +1153,10 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
.server = server,
};
struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
- .rpc_argp = &arg,
- .rpc_resp = &res,
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
+ .rpc_argp = &arg,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
};
unsigned long timestamp = jiffies;
int status;
@@ -1166,7 +1166,6 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) {
/* Use that stateid */
} else if (state != NULL) {
- msg.rpc_cred = state->owner->so_cred;
nfs4_copy_stateid(&arg.stateid, state, current->files);
} else
memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
@@ -1177,15 +1176,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
return status;
}
-static int nfs4_do_setattr(struct inode *inode, struct nfs_fattr *fattr,
- struct iattr *sattr, struct nfs4_state *state)
+static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
+ struct nfs_fattr *fattr, struct iattr *sattr,
+ struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs4_exception exception = { };
int err;
do {
err = nfs4_handle_exception(server,
- _nfs4_do_setattr(inode, fattr, sattr, state),
+ _nfs4_do_setattr(inode, cred, fattr, sattr, state),
&exception);
} while (exception.retry);
return err;
@@ -1647,29 +1647,25 @@ static int
nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
struct iattr *sattr)
{
- struct rpc_cred *cred;
struct inode *inode = dentry->d_inode;
- struct nfs_open_context *ctx;
+ struct rpc_cred *cred = NULL;
struct nfs4_state *state = NULL;
int status;
nfs_fattr_init(fattr);
- cred = rpc_lookup_cred();
- if (IS_ERR(cred))
- return PTR_ERR(cred);
-
/* Search for an existing open(O_WRITE) file */
- ctx = nfs_find_open_context(inode, cred, FMODE_WRITE);
- if (ctx != NULL)
+ if (sattr->ia_valid & ATTR_FILE) {
+ struct nfs_open_context *ctx;
+
+ ctx = nfs_file_open_context(sattr->ia_file);
+ cred = ctx->cred;
state = ctx->state;
+ }
- status = nfs4_do_setattr(inode, fattr, sattr, state);
+ status = nfs4_do_setattr(inode, cred, fattr, sattr, state);
if (status == 0)
nfs_setattr_update_inode(inode, sattr);
- if (ctx != NULL)
- put_nfs_open_context(ctx);
- put_rpccred(cred);
return status;
}
@@ -1897,17 +1893,16 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
goto out;
}
state = nfs4_do_open(dir, &path, flags, sattr, cred);
- put_rpccred(cred);
d_drop(dentry);
if (IS_ERR(state)) {
status = PTR_ERR(state);
- goto out;
+ goto out_putcred;
}
d_add(dentry, igrab(state->inode));
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
if (flags & O_EXCL) {
struct nfs_fattr fattr;
- status = nfs4_do_setattr(state->inode, &fattr, sattr, state);
+ status = nfs4_do_setattr(state->inode, cred, &fattr, sattr, state);
if (status == 0)
nfs_setattr_update_inode(state->inode, sattr);
nfs_post_op_update_inode(state->inode, &fattr);
@@ -1916,6 +1911,8 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = nfs4_intent_set_file(nd, &path, state);
else
nfs4_close_sync(&path, state, flags);
+out_putcred:
+ put_rpccred(cred);
out:
return status;
}
@@ -2079,47 +2076,81 @@ static int nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *n
return err;
}
+struct nfs4_createdata {
+ struct rpc_message msg;
+ struct nfs4_create_arg arg;
+ struct nfs4_create_res res;
+ struct nfs_fh fh;
+ struct nfs_fattr fattr;
+ struct nfs_fattr dir_fattr;
+};
+
+static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
+ struct qstr *name, struct iattr *sattr, u32 ftype)
+{
+ struct nfs4_createdata *data;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data != NULL) {
+ struct nfs_server *server = NFS_SERVER(dir);
+
+ data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE];
+ data->msg.rpc_argp = &data->arg;
+ data->msg.rpc_resp = &data->res;
+ data->arg.dir_fh = NFS_FH(dir);
+ data->arg.server = server;
+ data->arg.name = name;
+ data->arg.attrs = sattr;
+ data->arg.ftype = ftype;
+ data->arg.bitmask = server->attr_bitmask;
+ data->res.server = server;
+ data->res.fh = &data->fh;
+ data->res.fattr = &data->fattr;
+ data->res.dir_fattr = &data->dir_fattr;
+ nfs_fattr_init(data->res.fattr);
+ nfs_fattr_init(data->res.dir_fattr);
+ }
+ return data;
+}
+
+static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
+{
+ int status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0);
+ if (status == 0) {
+ update_changeattr(dir, &data->res.dir_cinfo);
+ nfs_post_op_update_inode(dir, data->res.dir_fattr);
+ status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ }
+ return status;
+}
+
+static void nfs4_free_createdata(struct nfs4_createdata *data)
+{
+ kfree(data);
+}
+
static int _nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
struct page *page, unsigned int len, struct iattr *sattr)
{
- struct nfs_server *server = NFS_SERVER(dir);
- struct nfs_fh fhandle;
- struct nfs_fattr fattr, dir_fattr;
- struct nfs4_create_arg arg = {
- .dir_fh = NFS_FH(dir),
- .server = server,
- .name = &dentry->d_name,
- .attrs = sattr,
- .ftype = NF4LNK,
- .bitmask = server->attr_bitmask,
- };
- struct nfs4_create_res res = {
- .server = server,
- .fh = &fhandle,
- .fattr = &fattr,
- .dir_fattr = &dir_fattr,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status;
+ struct nfs4_createdata *data;
+ int status = -ENAMETOOLONG;
if (len > NFS4_MAXPATHLEN)
- return -ENAMETOOLONG;
+ goto out;
- arg.u.symlink.pages = &page;
- arg.u.symlink.len = len;
- nfs_fattr_init(&fattr);
- nfs_fattr_init(&dir_fattr);
+ status = -ENOMEM;
+ data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4LNK);
+ if (data == NULL)
+ goto out;
+
+ data->msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SYMLINK];
+ data->arg.u.symlink.pages = &page;
+ data->arg.u.symlink.len = len;
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- if (!status) {
- update_changeattr(dir, &res.dir_cinfo);
- nfs_post_op_update_inode(dir, res.dir_fattr);
- status = nfs_instantiate(dentry, &fhandle, &fattr);
- }
+ status = nfs4_do_create(dir, dentry, data);
+
+ nfs4_free_createdata(data);
+out:
return status;
}
@@ -2140,39 +2171,17 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
struct iattr *sattr)
{
- struct nfs_server *server = NFS_SERVER(dir);
- struct nfs_fh fhandle;
- struct nfs_fattr fattr, dir_fattr;
- struct nfs4_create_arg arg = {
- .dir_fh = NFS_FH(dir),
- .server = server,
- .name = &dentry->d_name,
- .attrs = sattr,
- .ftype = NF4DIR,
- .bitmask = server->attr_bitmask,
- };
- struct nfs4_create_res res = {
- .server = server,
- .fh = &fhandle,
- .fattr = &fattr,
- .dir_fattr = &dir_fattr,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status;
+ struct nfs4_createdata *data;
+ int status = -ENOMEM;
- nfs_fattr_init(&fattr);
- nfs_fattr_init(&dir_fattr);
-
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- if (!status) {
- update_changeattr(dir, &res.dir_cinfo);
- nfs_post_op_update_inode(dir, res.dir_fattr);
- status = nfs_instantiate(dentry, &fhandle, &fattr);
- }
+ data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR);
+ if (data == NULL)
+ goto out;
+
+ status = nfs4_do_create(dir, dentry, data);
+
+ nfs4_free_createdata(data);
+out:
return status;
}
@@ -2242,56 +2251,34 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
static int _nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
struct iattr *sattr, dev_t rdev)
{
- struct nfs_server *server = NFS_SERVER(dir);
- struct nfs_fh fh;
- struct nfs_fattr fattr, dir_fattr;
- struct nfs4_create_arg arg = {
- .dir_fh = NFS_FH(dir),
- .server = server,
- .name = &dentry->d_name,
- .attrs = sattr,
- .bitmask = server->attr_bitmask,
- };
- struct nfs4_create_res res = {
- .server = server,
- .fh = &fh,
- .fattr = &fattr,
- .dir_fattr = &dir_fattr,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status;
- int mode = sattr->ia_mode;
-
- nfs_fattr_init(&fattr);
- nfs_fattr_init(&dir_fattr);
+ struct nfs4_createdata *data;
+ int mode = sattr->ia_mode;
+ int status = -ENOMEM;
BUG_ON(!(sattr->ia_valid & ATTR_MODE));
BUG_ON(!S_ISFIFO(mode) && !S_ISBLK(mode) && !S_ISCHR(mode) && !S_ISSOCK(mode));
+
+ data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4SOCK);
+ if (data == NULL)
+ goto out;
+
if (S_ISFIFO(mode))
- arg.ftype = NF4FIFO;
+ data->arg.ftype = NF4FIFO;
else if (S_ISBLK(mode)) {
- arg.ftype = NF4BLK;
- arg.u.device.specdata1 = MAJOR(rdev);
- arg.u.device.specdata2 = MINOR(rdev);
+ data->arg.ftype = NF4BLK;
+ data->arg.u.device.specdata1 = MAJOR(rdev);
+ data->arg.u.device.specdata2 = MINOR(rdev);
}
else if (S_ISCHR(mode)) {
- arg.ftype = NF4CHR;
- arg.u.device.specdata1 = MAJOR(rdev);
- arg.u.device.specdata2 = MINOR(rdev);
+ data->arg.ftype = NF4CHR;
+ data->arg.u.device.specdata1 = MAJOR(rdev);
+ data->arg.u.device.specdata2 = MINOR(rdev);
}
- else
- arg.ftype = NF4SOCK;
- status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
- if (status == 0) {
- update_changeattr(dir, &res.dir_cinfo);
- nfs_post_op_update_inode(dir, res.dir_fattr);
- status = nfs_instantiate(dentry, &fh, &fattr);
- }
+ status = nfs4_do_create(dir, dentry, data);
+
+ nfs4_free_createdata(data);
+out:
return status;
}
@@ -2706,6 +2693,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
ret = nfs_revalidate_inode(server, inode);
if (ret < 0)
return ret;
+ if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_ACL)
+ nfs_zap_acl_cache(inode);
ret = nfs4_read_cached_acl(inode, buf, buflen);
if (ret != -ENOENT)
return ret;
@@ -2733,7 +2722,8 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
nfs_inode_return_delegation(inode);
buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
- nfs_zap_caches(inode);
+ nfs_access_zap_cache(inode);
+ nfs_zap_acl_cache(inode);
return ret;
}
@@ -2767,8 +2757,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
task->tk_status = 0;
return -EAGAIN;
case -NFS4ERR_DELAY:
- nfs_inc_server_stats((struct nfs_server *) server,
- NFSIOS_DELAY);
+ nfs_inc_server_stats(server, NFSIOS_DELAY);
case -NFS4ERR_GRACE:
rpc_delay(task, NFS4_POLL_RETRY_MAX);
task->tk_status = 0;
@@ -2933,7 +2922,7 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
int nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cred *cred)
{
- long timeout;
+ long timeout = 0;
int err;
do {
err = _nfs4_proc_setclientid_confirm(clp, cred);
@@ -3725,8 +3714,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.write_done = nfs4_write_done,
.commit_setup = nfs4_proc_commit_setup,
.commit_done = nfs4_commit_done,
- .file_open = nfs_open,
- .file_release = nfs_release,
.lock = nfs4_proc_lock,
.clear_acl_cache = nfs4_zap_acl_attr,
};
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 856a8934f61..401ef8b28f9 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -940,7 +940,6 @@ static int reclaimer(void *ptr)
allow_signal(SIGKILL);
/* Ensure exclusive access to NFSv4 state */
- lock_kernel();
down_write(&clp->cl_sem);
/* Are there any NFS mounts out there? */
if (list_empty(&clp->cl_superblocks))
@@ -1000,7 +999,6 @@ restart_loop:
nfs_delegation_reap_unclaimed(clp);
out:
up_write(&clp->cl_sem);
- unlock_kernel();
if (status == -NFS4ERR_CB_PATH_DOWN)
nfs_handle_cb_pathdown(clp);
nfs4_clear_recover_bit(clp);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 531379d3682..8478fc25dae 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -1,6 +1,4 @@
/*
- * $Id: nfsroot.c,v 1.45 1998/03/07 10:44:46 mj Exp $
- *
* Copyright (C) 1995, 1996 Gero Kuhlmann <gero@gkminix.han.de>
*
* Allow an NFS filesystem to be mounted as root. The way this works is:
@@ -129,7 +127,7 @@ enum {
Opt_err
};
-static match_table_t __initdata tokens = {
+static match_table_t __initconst tokens = {
{Opt_port, "port=%u"},
{Opt_rsize, "rsize=%u"},
{Opt_wsize, "wsize=%u"},
@@ -297,10 +295,10 @@ static int __init root_nfs_name(char *name)
nfs_data.flags = NFS_MOUNT_NONLM; /* No lockd in nfs root yet */
nfs_data.rsize = NFS_DEF_FILE_IO_SIZE;
nfs_data.wsize = NFS_DEF_FILE_IO_SIZE;
- nfs_data.acregmin = 3;
- nfs_data.acregmax = 60;
- nfs_data.acdirmin = 30;
- nfs_data.acdirmax = 60;
+ nfs_data.acregmin = NFS_DEF_ACREGMIN;
+ nfs_data.acregmax = NFS_DEF_ACREGMAX;
+ nfs_data.acdirmin = NFS_DEF_ACDIRMIN;
+ nfs_data.acdirmax = NFS_DEF_ACDIRMAX;
strcpy(buf, NFS_ROOT);
/* Process options received from the remote server */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 03599bfe81c..4dbb84df1b6 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -129,6 +129,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
sattr->ia_mode &= S_IALLUGO;
dprintk("NFS call setattr\n");
+ if (sattr->ia_valid & ATTR_FILE)
+ msg.rpc_cred = nfs_file_cred(sattr->ia_file);
nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
if (status == 0)
@@ -598,6 +600,29 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl);
}
+/* Helper functions for NFS lock bounds checking */
+#define NFS_LOCK32_OFFSET_MAX ((__s32)0x7fffffffUL)
+static int nfs_lock_check_bounds(const struct file_lock *fl)
+{
+ __s32 start, end;
+
+ start = (__s32)fl->fl_start;
+ if ((loff_t)start != fl->fl_start)
+ goto out_einval;
+
+ if (fl->fl_end != OFFSET_MAX) {
+ end = (__s32)fl->fl_end;
+ if ((loff_t)end != fl->fl_end)
+ goto out_einval;
+ } else
+ end = NFS_LOCK32_OFFSET_MAX;
+
+ if (start < 0 || start > end)
+ goto out_einval;
+ return 0;
+out_einval:
+ return -EINVAL;
+}
const struct nfs_rpc_ops nfs_v2_clientops = {
.version = 2, /* protocol version */
@@ -630,7 +655,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.write_setup = nfs_proc_write_setup,
.write_done = nfs_write_done,
.commit_setup = nfs_proc_commit_setup,
- .file_open = nfs_open,
- .file_release = nfs_release,
.lock = nfs_proc_lock,
+ .lock_check_bounds = nfs_lock_check_bounds,
};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 614efeed543..ffb697416cb 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -47,6 +47,7 @@
#include <linux/inet.h>
#include <linux/in6.h>
#include <net/ipv6.h>
+#include <linux/netdevice.h>
#include <linux/nfs_xdr.h>
#include <linux/magic.h>
#include <linux/parser.h>
@@ -65,7 +66,6 @@
enum {
/* Mount options that take no arguments */
Opt_soft, Opt_hard,
- Opt_intr, Opt_nointr,
Opt_posix, Opt_noposix,
Opt_cto, Opt_nocto,
Opt_ac, Opt_noac,
@@ -92,19 +92,23 @@ enum {
Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost,
Opt_addr, Opt_mountaddr, Opt_clientaddr,
- /* Mount options that are ignored */
- Opt_userspace, Opt_deprecated,
+ /* Special mount options */
+ Opt_userspace, Opt_deprecated, Opt_sloppy,
Opt_err
};
-static match_table_t nfs_mount_option_tokens = {
+static const match_table_t nfs_mount_option_tokens = {
{ Opt_userspace, "bg" },
{ Opt_userspace, "fg" },
+ { Opt_userspace, "retry=%s" },
+
+ { Opt_sloppy, "sloppy" },
+
{ Opt_soft, "soft" },
{ Opt_hard, "hard" },
- { Opt_intr, "intr" },
- { Opt_nointr, "nointr" },
+ { Opt_deprecated, "intr" },
+ { Opt_deprecated, "nointr" },
{ Opt_posix, "posix" },
{ Opt_noposix, "noposix" },
{ Opt_cto, "cto" },
@@ -136,7 +140,6 @@ static match_table_t nfs_mount_option_tokens = {
{ Opt_acdirmin, "acdirmin=%u" },
{ Opt_acdirmax, "acdirmax=%u" },
{ Opt_actimeo, "actimeo=%u" },
- { Opt_userspace, "retry=%u" },
{ Opt_namelen, "namlen=%u" },
{ Opt_mountport, "mountport=%u" },
{ Opt_mountvers, "mountvers=%u" },
@@ -160,7 +163,7 @@ enum {
Opt_xprt_err
};
-static match_table_t nfs_xprt_protocol_tokens = {
+static const match_table_t nfs_xprt_protocol_tokens = {
{ Opt_xprt_udp, "udp" },
{ Opt_xprt_tcp, "tcp" },
{ Opt_xprt_rdma, "rdma" },
@@ -177,7 +180,7 @@ enum {
Opt_sec_err
};
-static match_table_t nfs_secflavor_tokens = {
+static const match_table_t nfs_secflavor_tokens = {
{ Opt_sec_none, "none" },
{ Opt_sec_none, "null" },
{ Opt_sec_sys, "sys" },
@@ -207,6 +210,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt);
static void nfs_kill_super(struct super_block *);
static void nfs_put_super(struct super_block *);
+static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
static struct file_system_type nfs_fs_type = {
.owner = THIS_MODULE,
@@ -234,6 +238,7 @@ static const struct super_operations nfs_sops = {
.umount_begin = nfs_umount_begin,
.show_options = nfs_show_options,
.show_stats = nfs_show_stats,
+ .remount_fs = nfs_remount,
};
#ifdef CONFIG_NFS_V4
@@ -278,6 +283,7 @@ static const struct super_operations nfs4_sops = {
.umount_begin = nfs_umount_begin,
.show_options = nfs_show_options,
.show_stats = nfs_show_stats,
+ .remount_fs = nfs_remount,
};
#endif
@@ -368,8 +374,6 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
};
int error;
- lock_kernel();
-
error = server->nfs_client->rpc_ops->statfs(server, fh, &res);
if (error < 0)
goto out_err;
@@ -401,12 +405,10 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_namelen = server->namelen;
- unlock_kernel();
return 0;
out_err:
dprintk("%s: statfs error = %d\n", __func__, -error);
- unlock_kernel();
return error;
}
@@ -514,13 +516,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
if (nfss->bsize != 0)
seq_printf(m, ",bsize=%u", nfss->bsize);
seq_printf(m, ",namlen=%u", nfss->namelen);
- if (nfss->acregmin != 3*HZ || showdefaults)
+ if (nfss->acregmin != NFS_DEF_ACREGMIN*HZ || showdefaults)
seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ);
- if (nfss->acregmax != 60*HZ || showdefaults)
+ if (nfss->acregmax != NFS_DEF_ACREGMAX*HZ || showdefaults)
seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ);
- if (nfss->acdirmin != 30*HZ || showdefaults)
+ if (nfss->acdirmin != NFS_DEF_ACDIRMIN*HZ || showdefaults)
seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ);
- if (nfss->acdirmax != 60*HZ || showdefaults)
+ if (nfss->acdirmax != NFS_DEF_ACDIRMAX*HZ || showdefaults)
seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ);
for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
if (nfss->flags & nfs_infop->flag)
@@ -702,49 +704,233 @@ static int nfs_verify_server_address(struct sockaddr *addr)
return 0;
}
+static void nfs_parse_ipv4_address(char *string, size_t str_len,
+ struct sockaddr *sap, size_t *addr_len)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)sap;
+ u8 *addr = (u8 *)&sin->sin_addr.s_addr;
+
+ if (str_len <= INET_ADDRSTRLEN) {
+ dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n",
+ (int)str_len, string);
+
+ sin->sin_family = AF_INET;
+ *addr_len = sizeof(*sin);
+ if (in4_pton(string, str_len, addr, '\0', NULL))
+ return;
+ }
+
+ sap->sa_family = AF_UNSPEC;
+ *addr_len = 0;
+}
+
+#define IPV6_SCOPE_DELIMITER '%'
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len,
+ const char *delim,
+ struct sockaddr_in6 *sin6)
+{
+ char *p;
+ size_t len;
+
+ if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+ return ;
+ if (*delim != IPV6_SCOPE_DELIMITER)
+ return;
+
+ len = (string + str_len) - delim - 1;
+ p = kstrndup(delim + 1, len, GFP_KERNEL);
+ if (p) {
+ unsigned long scope_id = 0;
+ struct net_device *dev;
+
+ dev = dev_get_by_name(&init_net, p);
+ if (dev != NULL) {
+ scope_id = dev->ifindex;
+ dev_put(dev);
+ } else {
+ /* scope_id is set to zero on error */
+ strict_strtoul(p, 10, &scope_id);
+ }
+
+ kfree(p);
+ sin6->sin6_scope_id = scope_id;
+ dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id);
+ }
+}
+
+static void nfs_parse_ipv6_address(char *string, size_t str_len,
+ struct sockaddr *sap, size_t *addr_len)
+{
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
+ u8 *addr = (u8 *)&sin6->sin6_addr.in6_u;
+ const char *delim;
+
+ if (str_len <= INET6_ADDRSTRLEN) {
+ dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n",
+ (int)str_len, string);
+
+ sin6->sin6_family = AF_INET6;
+ *addr_len = sizeof(*sin6);
+ if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) {
+ nfs_parse_ipv6_scope_id(string, str_len, delim, sin6);
+ return;
+ }
+ }
+
+ sap->sa_family = AF_UNSPEC;
+ *addr_len = 0;
+}
+#else
+static void nfs_parse_ipv6_address(char *string, size_t str_len,
+ struct sockaddr *sap, size_t *addr_len)
+{
+ sap->sa_family = AF_UNSPEC;
+ *addr_len = 0;
+}
+#endif
+
/*
- * Parse string addresses passed in via a mount option,
- * and construct a sockaddr based on the result.
+ * Construct a sockaddr based on the contents of a string that contains
+ * an IP address in presentation format.
*
- * If address parsing fails, set the sockaddr's address
- * family to AF_UNSPEC to force nfs_verify_server_address()
- * to punt the mount.
+ * If there is a problem constructing the new sockaddr, set the address
+ * family to AF_UNSPEC.
*/
-static void nfs_parse_server_address(char *value,
- struct sockaddr *sap,
- size_t *len)
+static void nfs_parse_ip_address(char *string, size_t str_len,
+ struct sockaddr *sap, size_t *addr_len)
{
- if (strchr(value, ':')) {
- struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap;
- u8 *addr = (u8 *)&ap->sin6_addr.in6_u;
+ unsigned int i, colons;
- ap->sin6_family = AF_INET6;
- *len = sizeof(*ap);
- if (in6_pton(value, -1, addr, '\0', NULL))
- return;
- } else {
- struct sockaddr_in *ap = (struct sockaddr_in *)sap;
- u8 *addr = (u8 *)&ap->sin_addr.s_addr;
+ colons = 0;
+ for (i = 0; i < str_len; i++)
+ if (string[i] == ':')
+ colons++;
+
+ if (colons >= 2)
+ nfs_parse_ipv6_address(string, str_len, sap, addr_len);
+ else
+ nfs_parse_ipv4_address(string, str_len, sap, addr_len);
+}
+
+/*
+ * Sanity check the NFS transport protocol.
+ *
+ */
+static void nfs_validate_transport_protocol(struct nfs_parsed_mount_data *mnt)
+{
+ switch (mnt->nfs_server.protocol) {
+ case XPRT_TRANSPORT_UDP:
+ case XPRT_TRANSPORT_TCP:
+ case XPRT_TRANSPORT_RDMA:
+ break;
+ default:
+ mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ }
+}
+
+/*
+ * For text based NFSv2/v3 mounts, the mount protocol transport default
+ * settings should depend upon the specified NFS transport.
+ */
+static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
+{
+ nfs_validate_transport_protocol(mnt);
- ap->sin_family = AF_INET;
- *len = sizeof(*ap);
- if (in4_pton(value, -1, addr, '\0', NULL))
+ if (mnt->mount_server.protocol == XPRT_TRANSPORT_UDP ||
+ mnt->mount_server.protocol == XPRT_TRANSPORT_TCP)
return;
+ switch (mnt->nfs_server.protocol) {
+ case XPRT_TRANSPORT_UDP:
+ mnt->mount_server.protocol = XPRT_TRANSPORT_UDP;
+ break;
+ case XPRT_TRANSPORT_TCP:
+ case XPRT_TRANSPORT_RDMA:
+ mnt->mount_server.protocol = XPRT_TRANSPORT_TCP;
}
+}
- sap->sa_family = AF_UNSPEC;
- *len = 0;
+/*
+ * Parse the value of the 'sec=' option.
+ *
+ * The flavor_len setting is for v4 mounts.
+ */
+static int nfs_parse_security_flavors(char *value,
+ struct nfs_parsed_mount_data *mnt)
+{
+ substring_t args[MAX_OPT_ARGS];
+
+ dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value);
+
+ switch (match_token(value, nfs_secflavor_tokens, args)) {
+ case Opt_sec_none:
+ mnt->auth_flavor_len = 0;
+ mnt->auth_flavors[0] = RPC_AUTH_NULL;
+ break;
+ case Opt_sec_sys:
+ mnt->auth_flavor_len = 0;
+ mnt->auth_flavors[0] = RPC_AUTH_UNIX;
+ break;
+ case Opt_sec_krb5:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
+ break;
+ case Opt_sec_krb5i:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
+ break;
+ case Opt_sec_krb5p:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
+ break;
+ case Opt_sec_lkey:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
+ break;
+ case Opt_sec_lkeyi:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
+ break;
+ case Opt_sec_lkeyp:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
+ break;
+ case Opt_sec_spkm:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
+ break;
+ case Opt_sec_spkmi:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
+ break;
+ case Opt_sec_spkmp:
+ mnt->auth_flavor_len = 1;
+ mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
+ break;
+ default:
+ return 0;
+ }
+
+ return 1;
+}
+
+static void nfs_parse_invalid_value(const char *option)
+{
+ dfprintk(MOUNT, "NFS: bad value specified for %s option\n", option);
}
/*
* Error-check and convert a string of mount options from user space into
- * a data structure
+ * a data structure. The whole mount string is processed; bad options are
+ * skipped as they are encountered. If there were no errors, return 1;
+ * otherwise return 0 (zero).
*/
static int nfs_parse_mount_options(char *raw,
struct nfs_parsed_mount_data *mnt)
{
char *p, *string, *secdata;
- int rc;
+ int rc, sloppy = 0, errors = 0;
if (!raw) {
dfprintk(MOUNT, "NFS: mount options string was NULL.\n");
@@ -777,15 +963,16 @@ static int nfs_parse_mount_options(char *raw,
token = match_token(p, nfs_mount_option_tokens, args);
switch (token) {
+
+ /*
+ * boolean options: foo/nofoo
+ */
case Opt_soft:
mnt->flags |= NFS_MOUNT_SOFT;
break;
case Opt_hard:
mnt->flags &= ~NFS_MOUNT_SOFT;
break;
- case Opt_intr:
- case Opt_nointr:
- break;
case Opt_posix:
mnt->flags |= NFS_MOUNT_POSIX;
break;
@@ -819,20 +1006,14 @@ static int nfs_parse_mount_options(char *raw,
case Opt_udp:
mnt->flags &= ~NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
- mnt->timeo = 7;
- mnt->retrans = 5;
break;
case Opt_tcp:
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
- mnt->timeo = 600;
- mnt->retrans = 2;
break;
case Opt_rdma:
mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
- mnt->timeo = 600;
- mnt->retrans = 2;
break;
case Opt_acl:
mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -853,165 +1034,144 @@ static int nfs_parse_mount_options(char *raw,
mnt->flags |= NFS_MOUNT_UNSHARED;
break;
+ /*
+ * options that take numeric values
+ */
case Opt_port:
- if (match_int(args, &option))
- return 0;
- if (option < 0 || option > 65535)
- return 0;
- mnt->nfs_server.port = option;
+ if (match_int(args, &option) ||
+ option < 0 || option > USHORT_MAX) {
+ errors++;
+ nfs_parse_invalid_value("port");
+ } else
+ mnt->nfs_server.port = option;
break;
case Opt_rsize:
- if (match_int(args, &mnt->rsize))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("rsize");
+ } else
+ mnt->rsize = option;
break;
case Opt_wsize:
- if (match_int(args, &mnt->wsize))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("wsize");
+ } else
+ mnt->wsize = option;
break;
case Opt_bsize:
- if (match_int(args, &option))
- return 0;
- if (option < 0)
- return 0;
- mnt->bsize = option;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("bsize");
+ } else
+ mnt->bsize = option;
break;
case Opt_timeo:
- if (match_int(args, &mnt->timeo))
- return 0;
+ if (match_int(args, &option) || option <= 0) {
+ errors++;
+ nfs_parse_invalid_value("timeo");
+ } else
+ mnt->timeo = option;
break;
case Opt_retrans:
- if (match_int(args, &mnt->retrans))
- return 0;
+ if (match_int(args, &option) || option <= 0) {
+ errors++;
+ nfs_parse_invalid_value("retrans");
+ } else
+ mnt->retrans = option;
break;
case Opt_acregmin:
- if (match_int(args, &mnt->acregmin))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("acregmin");
+ } else
+ mnt->acregmin = option;
break;
case Opt_acregmax:
- if (match_int(args, &mnt->acregmax))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("acregmax");
+ } else
+ mnt->acregmax = option;
break;
case Opt_acdirmin:
- if (match_int(args, &mnt->acdirmin))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("acdirmin");
+ } else
+ mnt->acdirmin = option;
break;
case Opt_acdirmax:
- if (match_int(args, &mnt->acdirmax))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("acdirmax");
+ } else
+ mnt->acdirmax = option;
break;
case Opt_actimeo:
- if (match_int(args, &option))
- return 0;
- if (option < 0)
- return 0;
- mnt->acregmin =
- mnt->acregmax =
- mnt->acdirmin =
- mnt->acdirmax = option;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("actimeo");
+ } else
+ mnt->acregmin = mnt->acregmax =
+ mnt->acdirmin = mnt->acdirmax = option;
break;
case Opt_namelen:
- if (match_int(args, &mnt->namlen))
- return 0;
+ if (match_int(args, &option) || option < 0) {
+ errors++;
+ nfs_parse_invalid_value("namlen");
+ } else
+ mnt->namlen = option;
break;
case Opt_mountport:
- if (match_int(args, &option))
- return 0;
- if (option < 0 || option > 65535)
- return 0;
- mnt->mount_server.port = option;
+ if (match_int(args, &option) ||
+ option < 0 || option > USHORT_MAX) {
+ errors++;
+ nfs_parse_invalid_value("mountport");
+ } else
+ mnt->mount_server.port = option;
break;
case Opt_mountvers:
- if (match_int(args, &option))
- return 0;
- if (option < 0)
- return 0;
- mnt->mount_server.version = option;
+ if (match_int(args, &option) ||
+ option < NFS_MNT_VERSION ||
+ option > NFS_MNT3_VERSION) {
+ errors++;
+ nfs_parse_invalid_value("mountvers");
+ } else
+ mnt->mount_server.version = option;
break;
case Opt_nfsvers:
- if (match_int(args, &option))
- return 0;
+ if (match_int(args, &option)) {
+ errors++;
+ nfs_parse_invalid_value("nfsvers");
+ break;
+ }
switch (option) {
- case 2:
+ case NFS2_VERSION:
mnt->flags &= ~NFS_MOUNT_VER3;
break;
- case 3:
+ case NFS3_VERSION:
mnt->flags |= NFS_MOUNT_VER3;
break;
default:
- goto out_unrec_vers;
+ errors++;
+ nfs_parse_invalid_value("nfsvers");
}
break;
+ /*
+ * options that take text values
+ */
case Opt_sec:
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
- token = match_token(string, nfs_secflavor_tokens, args);
+ rc = nfs_parse_security_flavors(string, mnt);
kfree(string);
-
- /*
- * The flags setting is for v2/v3. The flavor_len
- * setting is for v4. v2/v3 also need to know the
- * difference between NULL and UNIX.
- */
- switch (token) {
- case Opt_sec_none:
- mnt->flags &= ~NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 0;
- mnt->auth_flavors[0] = RPC_AUTH_NULL;
- break;
- case Opt_sec_sys:
- mnt->flags &= ~NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 0;
- mnt->auth_flavors[0] = RPC_AUTH_UNIX;
- break;
- case Opt_sec_krb5:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5;
- break;
- case Opt_sec_krb5i:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I;
- break;
- case Opt_sec_krb5p:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P;
- break;
- case Opt_sec_lkey:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY;
- break;
- case Opt_sec_lkeyi:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI;
- break;
- case Opt_sec_lkeyp:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP;
- break;
- case Opt_sec_spkm:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM;
- break;
- case Opt_sec_spkmi:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI;
- break;
- case Opt_sec_spkmp:
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- mnt->auth_flavor_len = 1;
- mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP;
- break;
- default:
- goto out_unrec_sec;
+ if (!rc) {
+ errors++;
+ dfprintk(MOUNT, "NFS: unrecognized "
+ "security flavor\n");
}
break;
case Opt_proto:
@@ -1026,24 +1186,20 @@ static int nfs_parse_mount_options(char *raw,
case Opt_xprt_udp:
mnt->flags &= ~NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
- mnt->timeo = 7;
- mnt->retrans = 5;
break;
case Opt_xprt_tcp:
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
- mnt->timeo = 600;
- mnt->retrans = 2;
break;
case Opt_xprt_rdma:
/* vector side protocols to TCP */
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
- mnt->timeo = 600;
- mnt->retrans = 2;
break;
default:
- goto out_unrec_xprt;
+ errors++;
+ dfprintk(MOUNT, "NFS: unrecognized "
+ "transport protocol\n");
}
break;
case Opt_mountproto:
@@ -1063,16 +1219,19 @@ static int nfs_parse_mount_options(char *raw,
break;
case Opt_xprt_rdma: /* not used for side protocols */
default:
- goto out_unrec_xprt;
+ errors++;
+ dfprintk(MOUNT, "NFS: unrecognized "
+ "transport protocol\n");
}
break;
case Opt_addr:
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
- nfs_parse_server_address(string, (struct sockaddr *)
- &mnt->nfs_server.address,
- &mnt->nfs_server.addrlen);
+ nfs_parse_ip_address(string, strlen(string),
+ (struct sockaddr *)
+ &mnt->nfs_server.address,
+ &mnt->nfs_server.addrlen);
kfree(string);
break;
case Opt_clientaddr:
@@ -1093,24 +1252,39 @@ static int nfs_parse_mount_options(char *raw,
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
- nfs_parse_server_address(string, (struct sockaddr *)
- &mnt->mount_server.address,
- &mnt->mount_server.addrlen);
+ nfs_parse_ip_address(string, strlen(string),
+ (struct sockaddr *)
+ &mnt->mount_server.address,
+ &mnt->mount_server.addrlen);
kfree(string);
break;
+ /*
+ * Special options
+ */
+ case Opt_sloppy:
+ sloppy = 1;
+ dfprintk(MOUNT, "NFS: relaxing parsing rules\n");
+ break;
case Opt_userspace:
case Opt_deprecated:
+ dfprintk(MOUNT, "NFS: ignoring mount option "
+ "'%s'\n", p);
break;
default:
- goto out_unknown;
+ errors++;
+ dfprintk(MOUNT, "NFS: unrecognized mount option "
+ "'%s'\n", p);
}
}
- nfs_set_port((struct sockaddr *)&mnt->nfs_server.address,
- mnt->nfs_server.port);
-
+ if (errors > 0) {
+ dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n",
+ errors, (errors == 1 ? "" : "s"));
+ if (!sloppy)
+ return 0;
+ }
return 1;
out_nomem:
@@ -1120,21 +1294,6 @@ out_security_failure:
free_secdata(secdata);
printk(KERN_INFO "NFS: security options invalid: %d\n", rc);
return 0;
-out_unrec_vers:
- printk(KERN_INFO "NFS: unrecognized NFS version number\n");
- return 0;
-
-out_unrec_xprt:
- printk(KERN_INFO "NFS: unrecognized transport protocol\n");
- return 0;
-
-out_unrec_sec:
- printk(KERN_INFO "NFS: unrecognized security flavor\n");
- return 0;
-
-out_unknown:
- printk(KERN_INFO "NFS: unknown mount option: %s\n", p);
- return 0;
}
/*
@@ -1188,11 +1347,146 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
if (status == 0)
return 0;
- dfprintk(MOUNT, "NFS: unable to mount server %s, error %d",
+ dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
hostname, status);
return status;
}
+static int nfs_parse_simple_hostname(const char *dev_name,
+ char **hostname, size_t maxnamlen,
+ char **export_path, size_t maxpathlen)
+{
+ size_t len;
+ char *colon, *comma;
+
+ colon = strchr(dev_name, ':');
+ if (colon == NULL)
+ goto out_bad_devname;
+
+ len = colon - dev_name;
+ if (len > maxnamlen)
+ goto out_hostname;
+
+ /* N.B. caller will free nfs_server.hostname in all cases */
+ *hostname = kstrndup(dev_name, len, GFP_KERNEL);
+ if (!*hostname)
+ goto out_nomem;
+
+ /* kill possible hostname list: not supported */
+ comma = strchr(*hostname, ',');
+ if (comma != NULL) {
+ if (comma == *hostname)
+ goto out_bad_devname;
+ *comma = '\0';
+ }
+
+ colon++;
+ len = strlen(colon);
+ if (len > maxpathlen)
+ goto out_path;
+ *export_path = kstrndup(colon, len, GFP_KERNEL);
+ if (!*export_path)
+ goto out_nomem;
+
+ dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
+ return 0;
+
+out_bad_devname:
+ dfprintk(MOUNT, "NFS: device name not in host:path format\n");
+ return -EINVAL;
+
+out_nomem:
+ dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
+ return -ENOMEM;
+
+out_hostname:
+ dfprintk(MOUNT, "NFS: server hostname too long\n");
+ return -ENAMETOOLONG;
+
+out_path:
+ dfprintk(MOUNT, "NFS: export pathname too long\n");
+ return -ENAMETOOLONG;
+}
+
+/*
+ * Hostname has square brackets around it because it contains one or
+ * more colons. We look for the first closing square bracket, and a
+ * colon must follow it.
+ */
+static int nfs_parse_protected_hostname(const char *dev_name,
+ char **hostname, size_t maxnamlen,
+ char **export_path, size_t maxpathlen)
+{
+ size_t len;
+ char *start, *end;
+
+ start = (char *)(dev_name + 1);
+
+ end = strchr(start, ']');
+ if (end == NULL)
+ goto out_bad_devname;
+ if (*(end + 1) != ':')
+ goto out_bad_devname;
+
+ len = end - start;
+ if (len > maxnamlen)
+ goto out_hostname;
+
+ /* N.B. caller will free nfs_server.hostname in all cases */
+ *hostname = kstrndup(start, len, GFP_KERNEL);
+ if (*hostname == NULL)
+ goto out_nomem;
+
+ end += 2;
+ len = strlen(end);
+ if (len > maxpathlen)
+ goto out_path;
+ *export_path = kstrndup(end, len, GFP_KERNEL);
+ if (!*export_path)
+ goto out_nomem;
+
+ return 0;
+
+out_bad_devname:
+ dfprintk(MOUNT, "NFS: device name not in host:path format\n");
+ return -EINVAL;
+
+out_nomem:
+ dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
+ return -ENOMEM;
+
+out_hostname:
+ dfprintk(MOUNT, "NFS: server hostname too long\n");
+ return -ENAMETOOLONG;
+
+out_path:
+ dfprintk(MOUNT, "NFS: export pathname too long\n");
+ return -ENAMETOOLONG;
+}
+
+/*
+ * Split "dev_name" into "hostname:export_path".
+ *
+ * The leftmost colon demarks the split between the server's hostname
+ * and the export path. If the hostname starts with a left square
+ * bracket, then it may contain colons.
+ *
+ * Note: caller frees hostname and export path, even on error.
+ */
+static int nfs_parse_devname(const char *dev_name,
+ char **hostname, size_t maxnamlen,
+ char **export_path, size_t maxpathlen)
+{
+ if (*dev_name == '[')
+ return nfs_parse_protected_hostname(dev_name,
+ hostname, maxnamlen,
+ export_path, maxpathlen);
+
+ return nfs_parse_simple_hostname(dev_name,
+ hostname, maxnamlen,
+ export_path, maxpathlen);
+}
+
/*
* Validate the NFS2/NFS3 mount data
* - fills in the mount root filehandle
@@ -1222,16 +1516,14 @@ static int nfs_validate_mount_data(void *options,
args->flags = (NFS_MOUNT_VER3 | NFS_MOUNT_TCP);
args->rsize = NFS_MAX_FILE_IO_SIZE;
args->wsize = NFS_MAX_FILE_IO_SIZE;
- args->timeo = 600;
- args->retrans = 2;
- args->acregmin = 3;
- args->acregmax = 60;
- args->acdirmin = 30;
- args->acdirmax = 60;
+ args->acregmin = NFS_DEF_ACREGMIN;
+ args->acregmax = NFS_DEF_ACREGMAX;
+ args->acdirmin = NFS_DEF_ACDIRMIN;
+ args->acdirmax = NFS_DEF_ACDIRMAX;
args->mount_server.port = 0; /* autobind unless user sets port */
- args->mount_server.protocol = XPRT_TRANSPORT_UDP;
args->nfs_server.port = 0; /* autobind unless user sets port */
args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ args->auth_flavors[0] = RPC_AUTH_UNIX;
switch (data->version) {
case 1:
@@ -1289,7 +1581,9 @@ static int nfs_validate_mount_data(void *options,
args->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL);
args->namlen = data->namlen;
args->bsize = data->bsize;
- args->auth_flavors[0] = data->pseudoflavor;
+
+ if (data->flags & NFS_MOUNT_SECFLAVOUR)
+ args->auth_flavors[0] = data->pseudoflavor;
if (!args->nfs_server.hostname)
goto out_nomem;
@@ -1321,8 +1615,6 @@ static int nfs_validate_mount_data(void *options,
break;
default: {
- unsigned int len;
- char *c;
int status;
if (nfs_parse_mount_options((char *)options, args) == 0)
@@ -1332,21 +1624,22 @@ static int nfs_validate_mount_data(void *options,
&args->nfs_server.address))
goto out_no_address;
- c = strchr(dev_name, ':');
- if (c == NULL)
- return -EINVAL;
- len = c - dev_name;
- /* N.B. caller will free nfs_server.hostname in all cases */
- args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
- if (!args->nfs_server.hostname)
- goto out_nomem;
+ nfs_set_port((struct sockaddr *)&args->nfs_server.address,
+ args->nfs_server.port);
- c++;
- if (strlen(c) > NFS_MAXPATHLEN)
- return -ENAMETOOLONG;
- args->nfs_server.export_path = c;
+ nfs_set_mount_transport_protocol(args);
+
+ status = nfs_parse_devname(dev_name,
+ &args->nfs_server.hostname,
+ PAGE_SIZE,
+ &args->nfs_server.export_path,
+ NFS_MAXPATHLEN);
+ if (!status)
+ status = nfs_try_mount(args, mntfh);
+
+ kfree(args->nfs_server.export_path);
+ args->nfs_server.export_path = NULL;
- status = nfs_try_mount(args, mntfh);
if (status)
return status;
@@ -1354,9 +1647,6 @@ static int nfs_validate_mount_data(void *options,
}
}
- if (!(args->flags & NFS_MOUNT_SECFLAVOUR))
- args->auth_flavors[0] = RPC_AUTH_UNIX;
-
#ifndef CONFIG_NFS_V3
if (args->flags & NFS_MOUNT_VER3)
goto out_v3_not_compiled;
@@ -1396,6 +1686,80 @@ out_invalid_fh:
return -EINVAL;
}
+static int
+nfs_compare_remount_data(struct nfs_server *nfss,
+ struct nfs_parsed_mount_data *data)
+{
+ if (data->flags != nfss->flags ||
+ data->rsize != nfss->rsize ||
+ data->wsize != nfss->wsize ||
+ data->retrans != nfss->client->cl_timeout->to_retries ||
+ data->auth_flavors[0] != nfss->client->cl_auth->au_flavor ||
+ data->acregmin != nfss->acregmin / HZ ||
+ data->acregmax != nfss->acregmax / HZ ||
+ data->acdirmin != nfss->acdirmin / HZ ||
+ data->acdirmax != nfss->acdirmax / HZ ||
+ data->timeo != (10U * nfss->client->cl_timeout->to_initval / HZ) ||
+ data->nfs_server.addrlen != nfss->nfs_client->cl_addrlen ||
+ memcmp(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
+ data->nfs_server.addrlen) != 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int
+nfs_remount(struct super_block *sb, int *flags, char *raw_data)
+{
+ int error;
+ struct nfs_server *nfss = sb->s_fs_info;
+ struct nfs_parsed_mount_data *data;
+ struct nfs_mount_data *options = (struct nfs_mount_data *)raw_data;
+ struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;
+ u32 nfsvers = nfss->nfs_client->rpc_ops->version;
+
+ /*
+ * Userspace mount programs that send binary options generally send
+ * them populated with default values. We have no way to know which
+ * ones were explicitly specified. Fall back to legacy behavior and
+ * just return success.
+ */
+ if ((nfsvers == 4 && (!options4 || options4->version == 1)) ||
+ (nfsvers <= 3 && (!options || (options->version >= 1 &&
+ options->version <= 6))))
+ return 0;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (data == NULL)
+ return -ENOMEM;
+
+ /* fill out struct with values from existing mount */
+ data->flags = nfss->flags;
+ data->rsize = nfss->rsize;
+ data->wsize = nfss->wsize;
+ data->retrans = nfss->client->cl_timeout->to_retries;
+ data->auth_flavors[0] = nfss->client->cl_auth->au_flavor;
+ data->acregmin = nfss->acregmin / HZ;
+ data->acregmax = nfss->acregmax / HZ;
+ data->acdirmin = nfss->acdirmin / HZ;
+ data->acdirmax = nfss->acdirmax / HZ;
+ data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
+ data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
+ memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
+ data->nfs_server.addrlen);
+
+ /* overwrite those values with any that were specified */
+ error = nfs_parse_mount_options((char *)options, data);
+ if (error < 0)
+ goto out;
+
+ /* compare new mount options with old ones */
+ error = nfs_compare_remount_data(nfss, data);
+out:
+ kfree(data);
+ return error;
+}
+
/*
* Initialise the common bits of the superblock
*/
@@ -1811,14 +2175,13 @@ static int nfs4_validate_mount_data(void *options,
args->rsize = NFS_MAX_FILE_IO_SIZE;
args->wsize = NFS_MAX_FILE_IO_SIZE;
- args->timeo = 600;
- args->retrans = 2;
- args->acregmin = 3;
- args->acregmax = 60;
- args->acdirmin = 30;
- args->acdirmax = 60;
+ args->acregmin = NFS_DEF_ACREGMIN;
+ args->acregmax = NFS_DEF_ACREGMAX;
+ args->acdirmin = NFS_DEF_ACDIRMIN;
+ args->acdirmax = NFS_DEF_ACDIRMAX;
args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */
- args->nfs_server.protocol = XPRT_TRANSPORT_TCP;
+ args->auth_flavors[0] = RPC_AUTH_UNIX;
+ args->auth_flavor_len = 0;
switch (data->version) {
case 1:
@@ -1834,18 +2197,13 @@ static int nfs4_validate_mount_data(void *options,
&args->nfs_server.address))
goto out_no_address;
- switch (data->auth_flavourlen) {
- case 0:
- args->auth_flavors[0] = RPC_AUTH_UNIX;
- break;
- case 1:
+ if (data->auth_flavourlen) {
+ if (data->auth_flavourlen > 1)
+ goto out_inval_auth;
if (copy_from_user(&args->auth_flavors[0],
data->auth_flavours,
sizeof(args->auth_flavors[0])))
return -EFAULT;
- break;
- default:
- goto out_inval_auth;
}
c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
@@ -1879,10 +2237,11 @@ static int nfs4_validate_mount_data(void *options,
args->acdirmin = data->acdirmin;
args->acdirmax = data->acdirmax;
args->nfs_server.protocol = data->proto;
+ nfs_validate_transport_protocol(args);
break;
default: {
- unsigned int len;
+ int status;
if (nfs_parse_mount_options((char *)options, args) == 0)
return -EINVAL;
@@ -1891,44 +2250,25 @@ static int nfs4_validate_mount_data(void *options,
&args->nfs_server.address))
return -EINVAL;
- switch (args->auth_flavor_len) {
- case 0:
- args->auth_flavors[0] = RPC_AUTH_UNIX;
- break;
- case 1:
- break;
- default:
- goto out_inval_auth;
- }
+ nfs_set_port((struct sockaddr *)&args->nfs_server.address,
+ args->nfs_server.port);
- /*
- * Split "dev_name" into "hostname:mntpath".
- */
- c = strchr(dev_name, ':');
- if (c == NULL)
- return -EINVAL;
- /* while calculating len, pretend ':' is '\0' */
- len = c - dev_name;
- if (len > NFS4_MAXNAMLEN)
- return -ENAMETOOLONG;
- /* N.B. caller will free nfs_server.hostname in all cases */
- args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL);
- if (!args->nfs_server.hostname)
- goto out_nomem;
-
- c++; /* step over the ':' */
- len = strlen(c);
- if (len > NFS4_MAXPATHLEN)
- return -ENAMETOOLONG;
- args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL);
- if (!args->nfs_server.export_path)
- goto out_nomem;
+ nfs_validate_transport_protocol(args);
- dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path);
+ if (args->auth_flavor_len > 1)
+ goto out_inval_auth;
if (args->client_address == NULL)
goto out_no_client_address;
+ status = nfs_parse_devname(dev_name,
+ &args->nfs_server.hostname,
+ NFS4_MAXNAMLEN,
+ &args->nfs_server.export_path,
+ NFS4_MAXPATHLEN);
+ if (status < 0)
+ return status;
+
break;
}
}
@@ -1944,10 +2284,6 @@ out_inval_auth:
data->auth_flavourlen);
return -EINVAL;
-out_nomem:
- dfprintk(MOUNT, "NFS4: not enough memory to handle mount options\n");
- return -ENOMEM;
-
out_no_address:
dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n");
return -EINVAL;
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 3adf8b26646..f089e5839d7 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -95,10 +95,11 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata)
static void nfs_async_unlink_release(void *calldata)
{
struct nfs_unlinkdata *data = calldata;
+ struct super_block *sb = data->dir->i_sb;
nfs_dec_sillycount(data->dir);
- nfs_sb_deactive(NFS_SERVER(data->dir));
nfs_free_unlinkdata(data);
+ nfs_sb_deactive(NFS_SB(sb));
}
static const struct rpc_call_ops nfs_unlink_ops = {
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f333848fd3b..3229e217c77 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -34,9 +34,6 @@
/*
* Local function declarations
*/
-static struct nfs_page * nfs_update_request(struct nfs_open_context*,
- struct page *,
- unsigned int, unsigned int);
static void nfs_pageio_init_write(struct nfs_pageio_descriptor *desc,
struct inode *inode, int ioflags);
static void nfs_redirty_request(struct nfs_page *req);
@@ -136,16 +133,21 @@ static struct nfs_page *nfs_page_find_request(struct page *page)
static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
{
struct inode *inode = page->mapping->host;
- loff_t end, i_size = i_size_read(inode);
- pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+ loff_t end, i_size;
+ pgoff_t end_index;
+ spin_lock(&inode->i_lock);
+ i_size = i_size_read(inode);
+ end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
if (i_size > 0 && page->index < end_index)
- return;
+ goto out;
end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count);
if (i_size >= end)
- return;
- nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
+ goto out;
i_size_write(inode, end);
+ nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
+out:
+ spin_unlock(&inode->i_lock);
}
/* A writeback failed: mark the page as bad, and invalidate the page cache */
@@ -169,29 +171,6 @@ static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int
SetPageUptodate(page);
}
-static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
- unsigned int offset, unsigned int count)
-{
- struct nfs_page *req;
- int ret;
-
- for (;;) {
- req = nfs_update_request(ctx, page, offset, count);
- if (!IS_ERR(req))
- break;
- ret = PTR_ERR(req);
- if (ret != -EBUSY)
- return ret;
- ret = nfs_wb_page(page->mapping->host, page);
- if (ret != 0)
- return ret;
- }
- /* Update file length */
- nfs_grow_file(page, offset, count);
- nfs_clear_page_tag_locked(req);
- return 0;
-}
-
static int wb_priority(struct writeback_control *wbc)
{
if (wbc->for_reclaim)
@@ -268,12 +247,9 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
return ret;
spin_lock(&inode->i_lock);
}
- if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
- /* This request is marked for commit */
+ if (test_bit(PG_CLEAN, &req->wb_flags)) {
spin_unlock(&inode->i_lock);
- nfs_clear_page_tag_locked(req);
- nfs_pageio_complete(pgio);
- return 0;
+ BUG();
}
if (nfs_set_page_writeback(page) != 0) {
spin_unlock(&inode->i_lock);
@@ -355,11 +331,19 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
/*
* Insert a write request into an inode
*/
-static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
+static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
{
struct nfs_inode *nfsi = NFS_I(inode);
int error;
+ error = radix_tree_preload(GFP_NOFS);
+ if (error != 0)
+ goto out;
+
+ /* Lock the request! */
+ nfs_lock_request_dontget(req);
+
+ spin_lock(&inode->i_lock);
error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
BUG_ON(error);
if (!nfsi->npages) {
@@ -373,6 +357,10 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
kref_get(&req->wb_kref);
radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index,
NFS_PAGE_TAG_LOCKED);
+ spin_unlock(&inode->i_lock);
+ radix_tree_preload_end();
+out:
+ return error;
}
/*
@@ -405,19 +393,6 @@ nfs_mark_request_dirty(struct nfs_page *req)
__set_page_dirty_nobuffers(req->wb_page);
}
-/*
- * Check if a request is dirty
- */
-static inline int
-nfs_dirty_request(struct nfs_page *req)
-{
- struct page *page = req->wb_page;
-
- if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags))
- return 0;
- return !PageWriteback(page);
-}
-
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
/*
* Add a request to the inode's commit list.
@@ -430,7 +405,7 @@ nfs_mark_request_commit(struct nfs_page *req)
spin_lock(&inode->i_lock);
nfsi->ncommit++;
- set_bit(PG_NEED_COMMIT, &(req)->wb_flags);
+ set_bit(PG_CLEAN, &(req)->wb_flags);
radix_tree_tag_set(&nfsi->nfs_page_tree,
req->wb_index,
NFS_PAGE_TAG_COMMIT);
@@ -440,6 +415,19 @@ nfs_mark_request_commit(struct nfs_page *req)
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
+static int
+nfs_clear_request_commit(struct nfs_page *req)
+{
+ struct page *page = req->wb_page;
+
+ if (test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) {
+ dec_zone_page_state(page, NR_UNSTABLE_NFS);
+ dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE);
+ return 1;
+ }
+ return 0;
+}
+
static inline
int nfs_write_need_commit(struct nfs_write_data *data)
{
@@ -449,7 +437,7 @@ int nfs_write_need_commit(struct nfs_write_data *data)
static inline
int nfs_reschedule_unstable_write(struct nfs_page *req)
{
- if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
+ if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) {
nfs_mark_request_commit(req);
return 1;
}
@@ -465,6 +453,12 @@ nfs_mark_request_commit(struct nfs_page *req)
{
}
+static inline int
+nfs_clear_request_commit(struct nfs_page *req)
+{
+ return 0;
+}
+
static inline
int nfs_write_need_commit(struct nfs_write_data *data)
{
@@ -522,11 +516,8 @@ static void nfs_cancel_commit_list(struct list_head *head)
while(!list_empty(head)) {
req = nfs_list_entry(head->next);
- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
- BDI_RECLAIMABLE);
nfs_list_remove_request(req);
- clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
+ nfs_clear_request_commit(req);
nfs_inode_remove_request(req);
nfs_unlock_request(req);
}
@@ -564,110 +555,124 @@ static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pg
#endif
/*
- * Try to update any existing write request, or create one if there is none.
- * In order to match, the request's credentials must match those of
- * the calling process.
+ * Search for an existing write request, and attempt to update
+ * it to reflect a new dirty region on a given page.
*
- * Note: Should always be called with the Page Lock held!
+ * If the attempt fails, then the existing request is flushed out
+ * to disk.
*/
-static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx,
- struct page *page, unsigned int offset, unsigned int bytes)
+static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
+ struct page *page,
+ unsigned int offset,
+ unsigned int bytes)
{
- struct address_space *mapping = page->mapping;
- struct inode *inode = mapping->host;
- struct nfs_page *req, *new = NULL;
- pgoff_t rqend, end;
+ struct nfs_page *req;
+ unsigned int rqend;
+ unsigned int end;
+ int error;
+
+ if (!PagePrivate(page))
+ return NULL;
end = offset + bytes;
+ spin_lock(&inode->i_lock);
for (;;) {
- /* Loop over all inode entries and see if we find
- * A request for the page we wish to update
+ req = nfs_page_find_request_locked(page);
+ if (req == NULL)
+ goto out_unlock;
+
+ rqend = req->wb_offset + req->wb_bytes;
+ /*
+ * Tell the caller to flush out the request if
+ * the offsets are non-contiguous.
+ * Note: nfs_flush_incompatible() will already
+ * have flushed out requests having wrong owners.
*/
- if (new) {
- if (radix_tree_preload(GFP_NOFS)) {
- nfs_release_request(new);
- return ERR_PTR(-ENOMEM);
- }
- }
+ if (offset > rqend
+ || end < req->wb_offset)
+ goto out_flushme;
- spin_lock(&inode->i_lock);
- req = nfs_page_find_request_locked(page);
- if (req) {
- if (!nfs_set_page_tag_locked(req)) {
- int error;
-
- spin_unlock(&inode->i_lock);
- error = nfs_wait_on_request(req);
- nfs_release_request(req);
- if (error < 0) {
- if (new) {
- radix_tree_preload_end();
- nfs_release_request(new);
- }
- return ERR_PTR(error);
- }
- continue;
- }
- spin_unlock(&inode->i_lock);
- if (new) {
- radix_tree_preload_end();
- nfs_release_request(new);
- }
+ if (nfs_set_page_tag_locked(req))
break;
- }
- if (new) {
- nfs_lock_request_dontget(new);
- nfs_inode_add_request(inode, new);
- spin_unlock(&inode->i_lock);
- radix_tree_preload_end();
- req = new;
- goto zero_page;
- }
+ /* The request is locked, so wait and then retry */
spin_unlock(&inode->i_lock);
-
- new = nfs_create_request(ctx, inode, page, offset, bytes);
- if (IS_ERR(new))
- return new;
+ error = nfs_wait_on_request(req);
+ nfs_release_request(req);
+ if (error != 0)
+ goto out_err;
+ spin_lock(&inode->i_lock);
}
- /* We have a request for our page.
- * If the creds don't match, or the
- * page addresses don't match,
- * tell the caller to wait on the conflicting
- * request.
- */
- rqend = req->wb_offset + req->wb_bytes;
- if (req->wb_context != ctx
- || req->wb_page != page
- || !nfs_dirty_request(req)
- || offset > rqend || end < req->wb_offset) {
- nfs_clear_page_tag_locked(req);
- return ERR_PTR(-EBUSY);
- }
+ if (nfs_clear_request_commit(req))
+ radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
+ req->wb_index, NFS_PAGE_TAG_COMMIT);
/* Okay, the request matches. Update the region */
if (offset < req->wb_offset) {
req->wb_offset = offset;
req->wb_pgbase = offset;
- req->wb_bytes = max(end, rqend) - req->wb_offset;
- goto zero_page;
}
-
if (end > rqend)
req->wb_bytes = end - req->wb_offset;
-
+ else
+ req->wb_bytes = rqend - req->wb_offset;
+out_unlock:
+ spin_unlock(&inode->i_lock);
return req;
-zero_page:
- /* If this page might potentially be marked as up to date,
- * then we need to zero any uninitalised data. */
- if (req->wb_pgbase == 0 && req->wb_bytes != PAGE_CACHE_SIZE
- && !PageUptodate(req->wb_page))
- zero_user_segment(req->wb_page, req->wb_bytes, PAGE_CACHE_SIZE);
+out_flushme:
+ spin_unlock(&inode->i_lock);
+ nfs_release_request(req);
+ error = nfs_wb_page(inode, page);
+out_err:
+ return ERR_PTR(error);
+}
+
+/*
+ * Try to update an existing write request, or create one if there is none.
+ *
+ * Note: Should always be called with the Page Lock held to prevent races
+ * if we have to add a new request. Also assumes that the caller has
+ * already called nfs_flush_incompatible() if necessary.
+ */
+static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
+ struct page *page, unsigned int offset, unsigned int bytes)
+{
+ struct inode *inode = page->mapping->host;
+ struct nfs_page *req;
+ int error;
+
+ req = nfs_try_to_update_request(inode, page, offset, bytes);
+ if (req != NULL)
+ goto out;
+ req = nfs_create_request(ctx, inode, page, offset, bytes);
+ if (IS_ERR(req))
+ goto out;
+ error = nfs_inode_add_request(inode, req);
+ if (error != 0) {
+ nfs_release_request(req);
+ req = ERR_PTR(error);
+ }
+out:
return req;
}
+static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
+ unsigned int offset, unsigned int count)
+{
+ struct nfs_page *req;
+
+ req = nfs_setup_write_request(ctx, page, offset, count);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ /* Update file length */
+ nfs_grow_file(page, offset, count);
+ nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+ nfs_clear_page_tag_locked(req);
+ return 0;
+}
+
int nfs_flush_incompatible(struct file *file, struct page *page)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
@@ -685,8 +690,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
req = nfs_page_find_request(page);
if (req == NULL)
return 0;
- do_flush = req->wb_page != page || req->wb_context != ctx
- || !nfs_dirty_request(req);
+ do_flush = req->wb_page != page || req->wb_context != ctx;
nfs_release_request(req);
if (!do_flush)
return 0;
@@ -721,10 +725,10 @@ int nfs_updatepage(struct file *file, struct page *page,
nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE);
- dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n",
+ dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n",
file->f_path.dentry->d_parent->d_name.name,
file->f_path.dentry->d_name.name, count,
- (long long)(page_offset(page) +offset));
+ (long long)(page_offset(page) + offset));
/* If we're not using byte range locks, and we know the page
* is up to date, it may be more efficient to extend the write
@@ -744,7 +748,7 @@ int nfs_updatepage(struct file *file, struct page *page,
else
__set_page_dirty_nobuffers(page);
- dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n",
+ dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
status, (long long)i_size_read(inode));
return status;
}
@@ -752,12 +756,7 @@ int nfs_updatepage(struct file *file, struct page *page,
static void nfs_writepage_release(struct nfs_page *req)
{
- if (PageError(req->wb_page)) {
- nfs_end_page_writeback(req->wb_page);
- nfs_inode_remove_request(req);
- } else if (!nfs_reschedule_unstable_write(req)) {
- /* Set the PG_uptodate flag */
- nfs_mark_uptodate(req->wb_page, req->wb_pgbase, req->wb_bytes);
+ if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) {
nfs_end_page_writeback(req->wb_page);
nfs_inode_remove_request(req);
} else
@@ -834,7 +833,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
NFS_PROTO(inode)->write_setup(data, &msg);
dprintk("NFS: %5u initiated write call "
- "(req %s/%Ld, %u bytes @ offset %Lu)\n",
+ "(req %s/%lld, %u bytes @ offset %llu)\n",
data->task.tk_pid,
inode->i_sb->s_id,
(long long)NFS_FILEID(inode),
@@ -978,13 +977,13 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata)
{
struct nfs_write_data *data = calldata;
- struct nfs_page *req = data->req;
- dprintk("NFS: write (%s/%Ld %d@%Ld)",
- req->wb_context->path.dentry->d_inode->i_sb->s_id,
- (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
- req->wb_bytes,
- (long long)req_offset(req));
+ dprintk("NFS: %5u write(%s/%lld %d@%lld)",
+ task->tk_pid,
+ data->req->wb_context->path.dentry->d_inode->i_sb->s_id,
+ (long long)
+ NFS_FILEID(data->req->wb_context->path.dentry->d_inode),
+ data->req->wb_bytes, (long long)req_offset(data->req));
nfs_writeback_done(task, data);
}
@@ -1058,7 +1057,8 @@ static void nfs_writeback_release_full(void *calldata)
nfs_list_remove_request(req);
- dprintk("NFS: write (%s/%Ld %d@%Ld)",
+ dprintk("NFS: %5u write (%s/%lld %d@%lld)",
+ data->task.tk_pid,
req->wb_context->path.dentry->d_inode->i_sb->s_id,
(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
req->wb_bytes,
@@ -1078,8 +1078,6 @@ static void nfs_writeback_release_full(void *calldata)
dprintk(" marked for commit\n");
goto next;
}
- /* Set the PG_uptodate flag? */
- nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
dprintk(" OK\n");
remove_request:
nfs_end_page_writeback(page);
@@ -1133,7 +1131,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
static unsigned long complain;
if (time_before(complain, jiffies)) {
- dprintk("NFS: faulty NFS server %s:"
+ dprintk("NFS: faulty NFS server %s:"
" (committed = %d) != (stable = %d)\n",
NFS_SERVER(data->inode)->nfs_client->cl_hostname,
resp->verf->committed, argp->stable);
@@ -1297,12 +1295,9 @@ static void nfs_commit_release(void *calldata)
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- clear_bit(PG_NEED_COMMIT, &(req)->wb_flags);
- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
- BDI_RECLAIMABLE);
+ nfs_clear_request_commit(req);
- dprintk("NFS: commit (%s/%Ld %d@%Ld)",
+ dprintk("NFS: commit (%s/%lld %d@%lld)",
req->wb_context->path.dentry->d_inode->i_sb->s_id,
(long long)NFS_FILEID(req->wb_context->path.dentry->d_inode),
req->wb_bytes,
@@ -1318,9 +1313,6 @@ static void nfs_commit_release(void *calldata)
* returned by the server against all stored verfs. */
if (!memcmp(req->wb_verf.verifier, data->verf.verifier, sizeof(data->verf.verifier))) {
/* We have a match */
- /* Set the PG_uptodate flag */
- nfs_mark_uptodate(req->wb_page, req->wb_pgbase,
- req->wb_bytes);
nfs_inode_remove_request(req);
dprintk(" OK\n");
goto next;
@@ -1479,7 +1471,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
req = nfs_page_find_request(page);
if (req == NULL)
goto out;
- if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) {
+ if (test_bit(PG_CLEAN, &req->wb_flags)) {
nfs_release_request(req);
break;
}
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 33bfcf09db4..9dc036f1835 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1023,7 +1023,7 @@ exp_export(struct nfsctl_export *nxp)
/* Look up the dentry */
err = path_lookup(nxp->ex_path, 0, &nd);
if (err)
- goto out_unlock;
+ goto out_put_clp;
err = -EINVAL;
exp = exp_get_by_name(clp, nd.path.mnt, nd.path.dentry, NULL);
@@ -1090,9 +1090,9 @@ finish:
exp_put(exp);
if (fsid_key && !IS_ERR(fsid_key))
cache_put(&fsid_key->h, &svc_expkey_cache);
- if (clp)
- auth_domain_put(clp);
path_put(&nd.path);
+out_put_clp:
+ auth_domain_put(clp);
out_unlock:
exp_writeunlock();
out:
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 9e4a568a501..b2786a5f9af 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -19,6 +19,13 @@
#define NFSDDBG_FACILITY NFSDDBG_LOCKD
+#ifdef CONFIG_LOCKD_V4
+#define nlm_stale_fh nlm4_stale_fh
+#define nlm_failed nlm4_failed
+#else
+#define nlm_stale_fh nlm_lck_denied_nolocks
+#define nlm_failed nlm_lck_denied_nolocks
+#endif
/*
* Note: we hold the dentry use count while the file is open.
*/
@@ -35,7 +42,7 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
fh.fh_export = NULL;
exp_readlock();
- nfserr = nfsd_open(rqstp, &fh, S_IFREG, MAY_LOCK, filp);
+ nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
fh_put(&fh);
rqstp->rq_client = NULL;
exp_readunlock();
@@ -47,12 +54,10 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
return 0;
case nfserr_dropit:
return nlm_drop_reply;
-#ifdef CONFIG_LOCKD_V4
case nfserr_stale:
- return nlm4_stale_fh;
-#endif
+ return nlm_stale_fh;
default:
- return nlm_lck_denied;
+ return nlm_failed;
}
}
@@ -65,7 +70,6 @@ nlm_fclose(struct file *filp)
static struct nlmsvc_binding nfsd_nlm_ops = {
.fopen = nlm_fopen, /* open file for locking */
.fclose = nlm_fclose, /* close file */
- .get_grace_period = get_nfs4_grace_period,
};
void
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 1c3b7654e96..4e3219e8411 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -40,7 +40,8 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh));
fh = fh_copy(&resp->fh, &argp->fh);
- if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
+ nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+ if (nfserr)
RETURN_STATUS(nfserr);
if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
@@ -107,7 +108,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh));
fh = fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR);
+ nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
if (!nfserr) {
nfserr = nfserrno( nfsd_set_posix_acl(
@@ -134,7 +135,7 @@ static __be32 nfsacld_proc_getattr(struct svc_rqst * rqstp,
dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- return fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+ return fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
}
/*
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index b647f2f872d..9981dbb377a 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -36,7 +36,8 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
__be32 nfserr = 0;
fh = fh_copy(&resp->fh, &argp->fh);
- if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
+ nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP);
+ if (nfserr)
RETURN_STATUS(nfserr);
if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
@@ -101,7 +102,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp,
__be32 nfserr = 0;
fh = fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_SATTR);
+ nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
if (!nfserr) {
nfserr = nfserrno( nfsd_set_posix_acl(
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index c721a1e6e9d..9dbd2eb9128 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -63,7 +63,8 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+ nfserr = fh_verify(rqstp, &resp->fh, 0,
+ NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
if (nfserr)
RETURN_STATUS(nfserr);
@@ -242,7 +243,7 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
attr = &argp->attrs;
/* Get the directory inode */
- nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, MAY_CREATE);
+ nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE);
if (nfserr)
RETURN_STATUS(nfserr);
@@ -530,7 +531,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
dprintk("nfsd: FSSTAT(3) %s\n",
SVCFH_fmt(&argp->fh));
- nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats);
+ nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
fh_put(&argp->fh);
RETURN_STATUS(nfserr);
}
@@ -558,7 +559,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
resp->f_maxfilesize = ~(u32) 0;
resp->f_properties = NFS3_FSF_DEFAULT;
- nfserr = fh_verify(rqstp, &argp->fh, 0, MAY_NOP);
+ nfserr = fh_verify(rqstp, &argp->fh, 0,
+ NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
/* Check special features of the file system. May request
* different read/write sizes for file systems known to have
@@ -597,7 +599,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
resp->p_case_insensitive = 0;
resp->p_case_preserving = 1;
- nfserr = fh_verify(rqstp, &argp->fh, 0, MAY_NOP);
+ nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP);
if (nfserr == 0) {
struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index b6ed38380ab..54b8b4140c8 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -443,7 +443,7 @@ init_state(struct posix_acl_state *state, int cnt)
* enough space for either:
*/
alloc = sizeof(struct posix_ace_state_array)
- + cnt*sizeof(struct posix_ace_state);
+ + cnt*sizeof(struct posix_user_ace_state);
state->users = kzalloc(alloc, GFP_KERNEL);
if (!state->users)
return -ENOMEM;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4d4760e687c..094747a1227 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -225,7 +225,8 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
WRITE32(OP_CB_RECALL);
- WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t));
+ WRITE32(cb_rec->cbr_stateid.si_generation);
+ WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
WRITE32(cb_rec->cbr_trunc);
WRITE32(len);
WRITEMEM(cb_rec->cbr_fhval, len);
@@ -379,9 +380,10 @@ static int do_probe_callback(void *data)
.addrsize = sizeof(addr),
.timeout = &timeparms,
.program = &cb_program,
+ .prognumber = cb->cb_prog,
.version = nfs_cb_version[1]->number,
.authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
- .flags = (RPC_CLNT_CREATE_NOPING),
+ .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
};
struct rpc_message msg = {
.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
@@ -396,9 +398,6 @@ static int do_probe_callback(void *data)
addr.sin_port = htons(cb->cb_port);
addr.sin_addr.s_addr = htonl(cb->cb_addr);
- /* Initialize rpc_stat */
- memset(args.program->stats, 0, sizeof(struct rpc_stat));
-
/* Create RPC client */
client = rpc_create(&args);
if (IS_ERR(client)) {
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c309c881bd4..669461e291a 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -71,11 +71,11 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
return nfserr_inval;
if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
- accmode |= MAY_READ;
+ accmode |= NFSD_MAY_READ;
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
- accmode |= (MAY_WRITE | MAY_TRUNC);
+ accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC);
if (open->op_share_deny & NFS4_SHARE_DENY_WRITE)
- accmode |= MAY_WRITE;
+ accmode |= NFSD_MAY_WRITE;
status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
@@ -126,7 +126,8 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
&resfh.fh_handle.fh_base, resfh.fh_handle.fh_size);
if (!created)
- status = do_open_permission(rqstp, current_fh, open, MAY_NOP);
+ status = do_open_permission(rqstp, current_fh, open,
+ NFSD_MAY_NOP);
out:
fh_put(&resfh);
@@ -157,7 +158,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
(open->op_iattr.ia_size == 0);
- status = do_open_permission(rqstp, current_fh, open, MAY_OWNER_OVERRIDE);
+ status = do_open_permission(rqstp, current_fh, open,
+ NFSD_MAY_OWNER_OVERRIDE);
return status;
}
@@ -186,7 +188,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len;
memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh,
rp->rp_openfh_len);
- status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP);
+ status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
if (status)
dprintk("nfsd4_open: replay failed"
" restoring previous filehandle\n");
@@ -199,10 +201,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* Openowner is now set, so sequence id will get bumped. Now we need
* these checks before we do any creates: */
status = nfserr_grace;
- if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+ if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
status = nfserr_no_grace;
- if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
+ if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
goto out;
switch (open->op_claim_type) {
@@ -285,7 +287,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
putfh->pf_fhlen);
- return fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP);
+ return fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
}
static __be32
@@ -363,7 +365,8 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fh_init(&resfh, NFS4_FHSIZE);
- status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, MAY_CREATE);
+ status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR,
+ NFSD_MAY_CREATE);
if (status == nfserr_symlink)
status = nfserr_notdir;
if (status)
@@ -445,7 +448,7 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
__be32 status;
- status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP);
+ status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
if (status)
return status;
@@ -572,7 +575,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
{
__be32 status;
- if (nfs4_in_grace())
+ if (locks_in_grace())
return nfserr_grace;
status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
remove->rm_name, remove->rm_namelen);
@@ -593,7 +596,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!cstate->save_fh.fh_dentry)
return status;
- if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags
+ if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags
& NFSEXP_NOSUBTREECHECK))
return nfserr_grace;
status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname,
@@ -730,7 +733,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
int count;
__be32 status;
- status = fh_verify(rqstp, &cstate->current_fh, 0, MAY_NOP);
+ status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
if (status)
return status;
@@ -843,10 +846,13 @@ struct nfsd4_operation {
#define ALLOWED_WITHOUT_FH 1
/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */
#define ALLOWED_ON_ABSENT_FS 2
+ char *op_name;
};
static struct nfsd4_operation nfsd4_ops[];
+static const char *nfsd4_op_name(unsigned opnum);
+
/*
* COMPOUND call.
*/
@@ -861,11 +867,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
int slack_bytes;
__be32 status;
- status = nfserr_resource;
- cstate = cstate_alloc();
- if (cstate == NULL)
- goto out;
-
resp->xbuf = &rqstp->rq_res;
resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;
resp->tagp = resp->p;
@@ -884,11 +885,18 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
goto out;
+ status = nfserr_resource;
+ cstate = cstate_alloc();
+ if (cstate == NULL)
+ goto out;
+
status = nfs_ok;
while (!status && resp->opcnt < args->opcnt) {
op = &args->ops[resp->opcnt++];
- dprintk("nfsv4 compound op #%d: %d\n", resp->opcnt, op->opnum);
+ dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
+ resp->opcnt, args->opcnt, op->opnum,
+ nfsd4_op_name(op->opnum));
/*
* The XDR decode routines may have pre-set op->status;
@@ -949,129 +957,172 @@ encode_op:
nfsd4_increment_op_stats(op->opnum);
}
+ cstate_free(cstate);
out:
nfsd4_release_compoundargs(args);
- cstate_free(cstate);
+ dprintk("nfsv4 compound returned %d\n", ntohl(status));
return status;
}
static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
[OP_ACCESS] = {
.op_func = (nfsd4op_func)nfsd4_access,
+ .op_name = "OP_ACCESS",
},
[OP_CLOSE] = {
.op_func = (nfsd4op_func)nfsd4_close,
+ .op_name = "OP_CLOSE",
},
[OP_COMMIT] = {
.op_func = (nfsd4op_func)nfsd4_commit,
+ .op_name = "OP_COMMIT",
},
[OP_CREATE] = {
.op_func = (nfsd4op_func)nfsd4_create,
+ .op_name = "OP_CREATE",
},
[OP_DELEGRETURN] = {
.op_func = (nfsd4op_func)nfsd4_delegreturn,
+ .op_name = "OP_DELEGRETURN",
},
[OP_GETATTR] = {
.op_func = (nfsd4op_func)nfsd4_getattr,
.op_flags = ALLOWED_ON_ABSENT_FS,
+ .op_name = "OP_GETATTR",
},
[OP_GETFH] = {
.op_func = (nfsd4op_func)nfsd4_getfh,
+ .op_name = "OP_GETFH",
},
[OP_LINK] = {
.op_func = (nfsd4op_func)nfsd4_link,
+ .op_name = "OP_LINK",
},
[OP_LOCK] = {
.op_func = (nfsd4op_func)nfsd4_lock,
+ .op_name = "OP_LOCK",
},
[OP_LOCKT] = {
.op_func = (nfsd4op_func)nfsd4_lockt,
+ .op_name = "OP_LOCKT",
},
[OP_LOCKU] = {
.op_func = (nfsd4op_func)nfsd4_locku,
+ .op_name = "OP_LOCKU",
},
[OP_LOOKUP] = {
.op_func = (nfsd4op_func)nfsd4_lookup,
+ .op_name = "OP_LOOKUP",
},
[OP_LOOKUPP] = {
.op_func = (nfsd4op_func)nfsd4_lookupp,
+ .op_name = "OP_LOOKUPP",
},
[OP_NVERIFY] = {
.op_func = (nfsd4op_func)nfsd4_nverify,
+ .op_name = "OP_NVERIFY",
},
[OP_OPEN] = {
.op_func = (nfsd4op_func)nfsd4_open,
+ .op_name = "OP_OPEN",
},
[OP_OPEN_CONFIRM] = {
.op_func = (nfsd4op_func)nfsd4_open_confirm,
+ .op_name = "OP_OPEN_CONFIRM",
},
[OP_OPEN_DOWNGRADE] = {
.op_func = (nfsd4op_func)nfsd4_open_downgrade,
+ .op_name = "OP_OPEN_DOWNGRADE",
},
[OP_PUTFH] = {
.op_func = (nfsd4op_func)nfsd4_putfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_name = "OP_PUTFH",
},
[OP_PUTPUBFH] = {
- /* unsupported; just for future reference: */
+ /* unsupported, just for future reference: */
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_name = "OP_PUTPUBFH",
},
[OP_PUTROOTFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_name = "OP_PUTROOTFH",
},
[OP_READ] = {
.op_func = (nfsd4op_func)nfsd4_read,
+ .op_name = "OP_READ",
},
[OP_READDIR] = {
.op_func = (nfsd4op_func)nfsd4_readdir,
+ .op_name = "OP_READDIR",
},
[OP_READLINK] = {
.op_func = (nfsd4op_func)nfsd4_readlink,
+ .op_name = "OP_READLINK",
},
[OP_REMOVE] = {
.op_func = (nfsd4op_func)nfsd4_remove,
+ .op_name = "OP_REMOVE",
},
[OP_RENAME] = {
+ .op_name = "OP_RENAME",
.op_func = (nfsd4op_func)nfsd4_rename,
},
[OP_RENEW] = {
.op_func = (nfsd4op_func)nfsd4_renew,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_name = "OP_RENEW",
},
[OP_RESTOREFH] = {
.op_func = (nfsd4op_func)nfsd4_restorefh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_name = "OP_RESTOREFH",
},
[OP_SAVEFH] = {
.op_func = (nfsd4op_func)nfsd4_savefh,
+ .op_name = "OP_SAVEFH",
},
[OP_SECINFO] = {
.op_func = (nfsd4op_func)nfsd4_secinfo,
+ .op_name = "OP_SECINFO",
},
[OP_SETATTR] = {
.op_func = (nfsd4op_func)nfsd4_setattr,
+ .op_name = "OP_SETATTR",
},
[OP_SETCLIENTID] = {
.op_func = (nfsd4op_func)nfsd4_setclientid,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_name = "OP_SETCLIENTID",
},
[OP_SETCLIENTID_CONFIRM] = {
.op_func = (nfsd4op_func)nfsd4_setclientid_confirm,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_name = "OP_SETCLIENTID_CONFIRM",
},
[OP_VERIFY] = {
.op_func = (nfsd4op_func)nfsd4_verify,
+ .op_name = "OP_VERIFY",
},
[OP_WRITE] = {
.op_func = (nfsd4op_func)nfsd4_write,
+ .op_name = "OP_WRITE",
},
[OP_RELEASE_LOCKOWNER] = {
.op_func = (nfsd4op_func)nfsd4_release_lockowner,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
+ .op_name = "OP_RELEASE_LOCKOWNER",
},
};
+static const char *nfsd4_op_name(unsigned opnum)
+{
+ if (opnum < ARRAY_SIZE(nfsd4_ops))
+ return nfsd4_ops[opnum].op_name;
+ return "unknown_operation";
+}
+
#define nfs4svc_decode_voidargs NULL
#define nfs4svc_release_void NULL
#define nfsd4_voidres nfsd4_voidargs
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8799b870818..0cc7ff5d5ab 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -61,7 +61,6 @@
static time_t lease_time = 90; /* default lease time */
static time_t user_lease_time = 90;
static time_t boot_time;
-static int in_grace = 1;
static u32 current_ownerid = 1;
static u32 current_fileid = 1;
static u32 current_delegid = 1;
@@ -1173,6 +1172,24 @@ static inline int deny_valid(u32 x)
return x <= NFS4_SHARE_DENY_BOTH;
}
+/*
+ * We store the NONE, READ, WRITE, and BOTH bits separately in the
+ * st_{access,deny}_bmap field of the stateid, in order to track not
+ * only what share bits are currently in force, but also what
+ * combinations of share bits previous opens have used. This allows us
+ * to enforce the recommendation of rfc 3530 14.2.19 that the server
+ * return an error if the client attempt to downgrade to a combination
+ * of share bits not explicable by closing some of its previous opens.
+ *
+ * XXX: This enforcement is actually incomplete, since we don't keep
+ * track of access/deny bit combinations; so, e.g., we allow:
+ *
+ * OPEN allow read, deny write
+ * OPEN allow both, deny none
+ * DOWNGRADE allow read, deny none
+ *
+ * which we should reject.
+ */
static void
set_access(unsigned int *access, unsigned long bmap) {
int i;
@@ -1570,6 +1587,10 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_sta
int err = get_write_access(inode);
if (err)
return nfserrno(err);
+ err = mnt_want_write(cur_fh->fh_export->ex_path.mnt);
+ if (err)
+ return nfserrno(err);
+ file_take_write(filp);
}
status = nfsd4_truncate(rqstp, cur_fh, open);
if (status) {
@@ -1579,8 +1600,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_sta
}
/* remember the open */
filp->f_mode |= open->op_share_access;
- set_bit(open->op_share_access, &stp->st_access_bmap);
- set_bit(open->op_share_deny, &stp->st_deny_bmap);
+ __set_bit(open->op_share_access, &stp->st_access_bmap);
+ __set_bit(open->op_share_deny, &stp->st_deny_bmap);
return nfs_ok;
}
@@ -1618,7 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
case NFS4_OPEN_CLAIM_NULL:
/* Let's not give out any delegations till everyone's
* had the chance to reclaim theirs.... */
- if (nfs4_in_grace())
+ if (locks_in_grace())
goto out;
if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
goto out;
@@ -1722,9 +1743,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
/* Stateid was not found, this is a new OPEN */
int flags = 0;
if (open->op_share_access & NFS4_SHARE_ACCESS_READ)
- flags |= MAY_READ;
+ flags |= NFSD_MAY_READ;
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
- flags |= MAY_WRITE;
+ flags |= NFSD_MAY_WRITE;
status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags);
if (status)
goto out;
@@ -1794,12 +1815,15 @@ out:
return status;
}
+struct lock_manager nfsd4_manager = {
+};
+
static void
-end_grace(void)
+nfsd4_end_grace(void)
{
dprintk("NFSD: end of grace period\n");
nfsd4_recdir_purge_old();
- in_grace = 0;
+ locks_end_grace(&nfsd4_manager);
}
static time_t
@@ -1816,8 +1840,8 @@ nfs4_laundromat(void)
nfs4_lock_state();
dprintk("NFSD: laundromat service - starting\n");
- if (in_grace)
- end_grace();
+ if (locks_in_grace())
+ nfsd4_end_grace();
list_for_each_safe(pos, next, &client_lru) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
@@ -1952,7 +1976,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
return nfserr_bad_stateid;
else if (ONE_STATEID(stateid) && (flags & RD_STATE))
return nfs_ok;
- else if (nfs4_in_grace()) {
+ else if (locks_in_grace()) {
/* Answer in remaining cases depends on existance of
* conflicting state; so we must wait out the grace period. */
return nfserr_grace;
@@ -1971,7 +1995,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
static inline int
io_during_grace_disallowed(struct inode *inode, int flags)
{
- return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE))
+ return locks_in_grace() && (flags & (RD_STATE | WR_STATE))
&& mandatory_lock(inode);
}
@@ -2610,7 +2634,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_inval;
if ((status = fh_verify(rqstp, &cstate->current_fh,
- S_IFREG, MAY_LOCK))) {
+ S_IFREG, NFSD_MAY_LOCK))) {
dprintk("NFSD: nfsd4_lock: permission denied!\n");
return status;
}
@@ -2671,10 +2695,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
filp = lock_stp->st_vfs_file;
status = nfserr_grace;
- if (nfs4_in_grace() && !lock->lk_reclaim)
+ if (locks_in_grace() && !lock->lk_reclaim)
goto out;
status = nfserr_no_grace;
- if (!nfs4_in_grace() && lock->lk_reclaim)
+ if (!locks_in_grace() && lock->lk_reclaim)
goto out;
locks_init_lock(&file_lock);
@@ -2757,7 +2781,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
int error;
__be32 status;
- if (nfs4_in_grace())
+ if (locks_in_grace())
return nfserr_grace;
if (check_lock_length(lockt->lt_offset, lockt->lt_length))
@@ -3170,9 +3194,9 @@ __nfs4_state_start(void)
unsigned long grace_time;
boot_time = get_seconds();
- grace_time = get_nfs_grace_period();
+ grace_time = get_nfs4_grace_period();
lease_time = user_lease_time;
- in_grace = 1;
+ locks_start_grace(&nfsd4_manager);
printk(KERN_INFO "NFSD: starting %ld-second grace period\n",
grace_time/HZ);
laundry_wq = create_singlethread_workqueue("nfsd4");
@@ -3191,12 +3215,6 @@ nfs4_state_start(void)
return;
}
-int
-nfs4_in_grace(void)
-{
- return in_grace;
-}
-
time_t
nfs4_lease_time(void)
{
@@ -3249,12 +3267,14 @@ nfs4_state_shutdown(void)
nfs4_unlock_state();
}
+/*
+ * user_recovery_dirname is protected by the nfsd_mutex since it's only
+ * accessed when nfsd is starting.
+ */
static void
nfs4_set_recdir(char *recdir)
{
- nfs4_lock_state();
strcpy(user_recovery_dirname, recdir);
- nfs4_unlock_state();
}
/*
@@ -3278,6 +3298,12 @@ nfs4_reset_recoverydir(char *recdir)
return status;
}
+char *
+nfs4_recoverydir(void)
+{
+ return user_recovery_dirname;
+}
+
/*
* Called when leasetime is changed.
*
@@ -3286,11 +3312,12 @@ nfs4_reset_recoverydir(char *recdir)
* we start to register any changes in lease time. If the administrator
* really wants to change the lease time *now*, they can go ahead and bring
* nfsd down and then back up again after changing the lease time.
+ *
+ * user_lease_time is protected by nfsd_mutex since it's only really accessed
+ * when nfsd is starting
*/
void
nfs4_reset_lease(time_t leasetime)
{
- lock_kernel();
user_lease_time = leasetime;
- unlock_kernel();
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c513bbdf2d3..afcdf4b7684 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -413,6 +413,18 @@ out_nfserr:
}
static __be32
+nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
+{
+ DECODE_HEAD;
+
+ READ_BUF(sizeof(stateid_t));
+ READ32(sid->si_generation);
+ COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+
+ DECODE_TAIL;
+}
+
+static __be32
nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
{
DECODE_HEAD;
@@ -429,10 +441,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
DECODE_HEAD;
close->cl_stateowner = NULL;
- READ_BUF(4 + sizeof(stateid_t));
+ READ_BUF(4);
READ32(close->cl_seqid);
- READ32(close->cl_stateid.si_generation);
- COPYMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
+ return nfsd4_decode_stateid(argp, &close->cl_stateid);
DECODE_TAIL;
}
@@ -493,13 +504,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
static inline __be32
nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
{
- DECODE_HEAD;
-
- READ_BUF(sizeof(stateid_t));
- READ32(dr->dr_stateid.si_generation);
- COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t));
-
- DECODE_TAIL;
+ return nfsd4_decode_stateid(argp, &dr->dr_stateid);
}
static inline __be32
@@ -542,20 +547,22 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
READ32(lock->lk_is_new);
if (lock->lk_is_new) {
- READ_BUF(36);
+ READ_BUF(4);
READ32(lock->lk_new_open_seqid);
- READ32(lock->lk_new_open_stateid.si_generation);
-
- COPYMEM(&lock->lk_new_open_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
+ if (status)
+ return status;
+ READ_BUF(8 + sizeof(clientid_t));
READ32(lock->lk_new_lock_seqid);
COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
READ32(lock->lk_new_owner.len);
READ_BUF(lock->lk_new_owner.len);
READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
} else {
- READ_BUF(20);
- READ32(lock->lk_old_lock_stateid.si_generation);
- COPYMEM(&lock->lk_old_lock_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid);
+ if (status)
+ return status;
+ READ_BUF(4);
READ32(lock->lk_old_lock_seqid);
}
@@ -587,13 +594,15 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
DECODE_HEAD;
locku->lu_stateowner = NULL;
- READ_BUF(24 + sizeof(stateid_t));
+ READ_BUF(8);
READ32(locku->lu_type);
if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
goto xdr_error;
READ32(locku->lu_seqid);
- READ32(locku->lu_stateid.si_generation);
- COPYMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
+ if (status)
+ return status;
+ READ_BUF(16);
READ64(locku->lu_offset);
READ64(locku->lu_length);
@@ -678,8 +687,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
READ32(open->op_delegate_type);
break;
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
- READ_BUF(sizeof(stateid_t) + 4);
- COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+ status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
+ if (status)
+ return status;
+ READ_BUF(4);
READ32(open->op_fname.len);
READ_BUF(open->op_fname.len);
SAVEMEM(open->op_fname.data, open->op_fname.len);
@@ -699,9 +710,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con
DECODE_HEAD;
open_conf->oc_stateowner = NULL;
- READ_BUF(4 + sizeof(stateid_t));
- READ32(open_conf->oc_req_stateid.si_generation);
- COPYMEM(&open_conf->oc_req_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
+ if (status)
+ return status;
+ READ_BUF(4);
READ32(open_conf->oc_seqid);
DECODE_TAIL;
@@ -713,9 +725,10 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
DECODE_HEAD;
open_down->od_stateowner = NULL;
- READ_BUF(12 + sizeof(stateid_t));
- READ32(open_down->od_stateid.si_generation);
- COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
+ if (status)
+ return status;
+ READ_BUF(12);
READ32(open_down->od_seqid);
READ32(open_down->od_share_access);
READ32(open_down->od_share_deny);
@@ -743,9 +756,10 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
{
DECODE_HEAD;
- READ_BUF(sizeof(stateid_t) + 12);
- READ32(read->rd_stateid.si_generation);
- COPYMEM(&read->rd_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &read->rd_stateid);
+ if (status)
+ return status;
+ READ_BUF(12);
READ64(read->rd_offset);
READ32(read->rd_length);
@@ -834,15 +848,13 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
static __be32
nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
{
- DECODE_HEAD;
-
- READ_BUF(sizeof(stateid_t));
- READ32(setattr->sa_stateid.si_generation);
- COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t));
- if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl)))
- goto out;
+ __be32 status;
- DECODE_TAIL;
+ status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
+ if (status)
+ return status;
+ return nfsd4_decode_fattr(argp, setattr->sa_bmval,
+ &setattr->sa_iattr, &setattr->sa_acl);
}
static __be32
@@ -927,9 +939,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
int len;
DECODE_HEAD;
- READ_BUF(sizeof(stateid_opaque_t) + 20);
- READ32(write->wr_stateid.si_generation);
- COPYMEM(&write->wr_stateid.si_opaque, sizeof(stateid_opaque_t));
+ status = nfsd4_decode_stateid(argp, &write->wr_stateid);
+ if (status)
+ return status;
+ READ_BUF(16);
READ64(write->wr_offset);
READ32(write->wr_stable_how);
if (write->wr_stable_how > 2)
@@ -986,10 +999,74 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
}
static __be32
+nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
+{
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
+{
+ return nfserr_opnotsupp;
+}
+
+typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
+
+static nfsd4_dec nfsd4_dec_ops[] = {
+ [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access,
+ [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close,
+ [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit,
+ [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create,
+ [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn,
+ [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr,
+ [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop,
+ [OP_LINK] = (nfsd4_dec)nfsd4_decode_link,
+ [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock,
+ [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt,
+ [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku,
+ [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup,
+ [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop,
+ [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify,
+ [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open,
+ [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
+ [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
+ [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
+ [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
+ [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
+ [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
+ [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop,
+ [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove,
+ [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename,
+ [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew,
+ [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop,
+ [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop,
+ [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo,
+ [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr,
+ [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid,
+ [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm,
+ [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify,
+ [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write,
+ [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
+};
+
+struct nfsd4_minorversion_ops {
+ nfsd4_dec *decoders;
+ int nops;
+};
+
+static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
+ [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
+};
+
+static __be32
nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
{
DECODE_HEAD;
struct nfsd4_op *op;
+ struct nfsd4_minorversion_ops *ops;
int i;
/*
@@ -1019,6 +1096,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
}
}
+ if (argp->minorversion >= ARRAY_SIZE(nfsd4_minorversion))
+ argp->opcnt = 0;
+
+ ops = &nfsd4_minorversion[argp->minorversion];
for (i = 0; i < argp->opcnt; i++) {
op = &argp->ops[i];
op->replay = NULL;
@@ -1056,120 +1137,11 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
}
op->opnum = ntohl(*argp->p++);
- switch (op->opnum) {
- case 2: /* Reserved operation */
- op->opnum = OP_ILLEGAL;
- if (argp->minorversion == 0)
- op->status = nfserr_op_illegal;
- else
- op->status = nfserr_minor_vers_mismatch;
- break;
- case OP_ACCESS:
- op->status = nfsd4_decode_access(argp, &op->u.access);
- break;
- case OP_CLOSE:
- op->status = nfsd4_decode_close(argp, &op->u.close);
- break;
- case OP_COMMIT:
- op->status = nfsd4_decode_commit(argp, &op->u.commit);
- break;
- case OP_CREATE:
- op->status = nfsd4_decode_create(argp, &op->u.create);
- break;
- case OP_DELEGRETURN:
- op->status = nfsd4_decode_delegreturn(argp, &op->u.delegreturn);
- break;
- case OP_GETATTR:
- op->status = nfsd4_decode_getattr(argp, &op->u.getattr);
- break;
- case OP_GETFH:
- op->status = nfs_ok;
- break;
- case OP_LINK:
- op->status = nfsd4_decode_link(argp, &op->u.link);
- break;
- case OP_LOCK:
- op->status = nfsd4_decode_lock(argp, &op->u.lock);
- break;
- case OP_LOCKT:
- op->status = nfsd4_decode_lockt(argp, &op->u.lockt);
- break;
- case OP_LOCKU:
- op->status = nfsd4_decode_locku(argp, &op->u.locku);
- break;
- case OP_LOOKUP:
- op->status = nfsd4_decode_lookup(argp, &op->u.lookup);
- break;
- case OP_LOOKUPP:
- op->status = nfs_ok;
- break;
- case OP_NVERIFY:
- op->status = nfsd4_decode_verify(argp, &op->u.nverify);
- break;
- case OP_OPEN:
- op->status = nfsd4_decode_open(argp, &op->u.open);
- break;
- case OP_OPEN_CONFIRM:
- op->status = nfsd4_decode_open_confirm(argp, &op->u.open_confirm);
- break;
- case OP_OPEN_DOWNGRADE:
- op->status = nfsd4_decode_open_downgrade(argp, &op->u.open_downgrade);
- break;
- case OP_PUTFH:
- op->status = nfsd4_decode_putfh(argp, &op->u.putfh);
- break;
- case OP_PUTROOTFH:
- op->status = nfs_ok;
- break;
- case OP_READ:
- op->status = nfsd4_decode_read(argp, &op->u.read);
- break;
- case OP_READDIR:
- op->status = nfsd4_decode_readdir(argp, &op->u.readdir);
- break;
- case OP_READLINK:
- op->status = nfs_ok;
- break;
- case OP_REMOVE:
- op->status = nfsd4_decode_remove(argp, &op->u.remove);
- break;
- case OP_RENAME:
- op->status = nfsd4_decode_rename(argp, &op->u.rename);
- break;
- case OP_RESTOREFH:
- op->status = nfs_ok;
- break;
- case OP_RENEW:
- op->status = nfsd4_decode_renew(argp, &op->u.renew);
- break;
- case OP_SAVEFH:
- op->status = nfs_ok;
- break;
- case OP_SECINFO:
- op->status = nfsd4_decode_secinfo(argp, &op->u.secinfo);
- break;
- case OP_SETATTR:
- op->status = nfsd4_decode_setattr(argp, &op->u.setattr);
- break;
- case OP_SETCLIENTID:
- op->status = nfsd4_decode_setclientid(argp, &op->u.setclientid);
- break;
- case OP_SETCLIENTID_CONFIRM:
- op->status = nfsd4_decode_setclientid_confirm(argp, &op->u.setclientid_confirm);
- break;
- case OP_VERIFY:
- op->status = nfsd4_decode_verify(argp, &op->u.verify);
- break;
- case OP_WRITE:
- op->status = nfsd4_decode_write(argp, &op->u.write);
- break;
- case OP_RELEASE_LOCKOWNER:
- op->status = nfsd4_decode_release_lockowner(argp, &op->u.release_lockowner);
- break;
- default:
+ if (op->opnum >= OP_ACCESS && op->opnum < ops->nops)
+ op->status = ops->decoders[op->opnum](argp, &op->u);
+ else {
op->opnum = OP_ILLEGAL;
op->status = nfserr_op_illegal;
- break;
}
if (op->status) {
@@ -1201,11 +1173,11 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
*p++ = htonl((u32)((n) >> 32)); \
*p++ = htonl((u32)(n)); \
} while (0)
-#define WRITEMEM(ptr,nbytes) do { \
+#define WRITEMEM(ptr,nbytes) do { if (nbytes > 0) { \
*(p + XDR_QUADLEN(nbytes) -1) = 0; \
memcpy(p, ptr, nbytes); \
p += XDR_QUADLEN(nbytes); \
-} while (0)
+}} while (0)
#define WRITECINFO(c) do { \
*p++ = htonl(c.atomic); \
*p++ = htonl(c.before_ctime_sec); \
@@ -1224,7 +1196,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
* Header routine to setup seqid operation replay cache
*/
#define ENCODE_SEQID_OP_HEAD \
- __be32 *p; \
__be32 *save; \
\
save = resp->p;
@@ -1992,6 +1963,17 @@ fail:
}
static void
+nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
+{
+ ENCODE_HEAD;
+
+ RESERVE_SPACE(sizeof(stateid_t));
+ WRITE32(sid->si_generation);
+ WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+ ADJUST_ARGS();
+}
+
+static __be32
nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
{
ENCODE_HEAD;
@@ -2002,24 +1984,23 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
WRITE32(access->ac_resp_access);
ADJUST_ARGS();
}
+ return nfserr;
}
-static void
+static __be32
nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
{
ENCODE_SEQID_OP_HEAD;
- if (!nfserr) {
- RESERVE_SPACE(sizeof(stateid_t));
- WRITE32(close->cl_stateid.si_generation);
- WRITEMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- }
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &close->cl_stateid);
+
ENCODE_SEQID_OP_TAIL(close->cl_stateowner);
+ return nfserr;
}
-static void
+static __be32
nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
{
ENCODE_HEAD;
@@ -2029,9 +2010,10 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
WRITEMEM(commit->co_verf.data, 8);
ADJUST_ARGS();
}
+ return nfserr;
}
-static void
+static __be32
nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
{
ENCODE_HEAD;
@@ -2044,6 +2026,7 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
WRITE32(create->cr_bmval[1]);
ADJUST_ARGS();
}
+ return nfserr;
}
static __be32
@@ -2064,9 +2047,10 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
return nfserr;
}
-static void
-nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh *fhp)
+static __be32
+nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp)
{
+ struct svc_fh *fhp = *fhpp;
unsigned int len;
ENCODE_HEAD;
@@ -2077,6 +2061,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
WRITEMEM(&fhp->fh_handle.fh_base, len);
ADJUST_ARGS();
}
+ return nfserr;
}
/*
@@ -2104,46 +2089,42 @@ nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denie
ADJUST_ARGS();
}
-static void
+static __be32
nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
{
ENCODE_SEQID_OP_HEAD;
- if (!nfserr) {
- RESERVE_SPACE(4 + sizeof(stateid_t));
- WRITE32(lock->lk_resp_stateid.si_generation);
- WRITEMEM(&lock->lk_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- } else if (nfserr == nfserr_denied)
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &lock->lk_resp_stateid);
+ else if (nfserr == nfserr_denied)
nfsd4_encode_lock_denied(resp, &lock->lk_denied);
ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner);
+ return nfserr;
}
-static void
+static __be32
nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
{
if (nfserr == nfserr_denied)
nfsd4_encode_lock_denied(resp, &lockt->lt_denied);
+ return nfserr;
}
-static void
+static __be32
nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
{
ENCODE_SEQID_OP_HEAD;
- if (!nfserr) {
- RESERVE_SPACE(sizeof(stateid_t));
- WRITE32(locku->lu_stateid.si_generation);
- WRITEMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- }
-
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &locku->lu_stateid);
+
ENCODE_SEQID_OP_TAIL(locku->lu_stateowner);
+ return nfserr;
}
-static void
+static __be32
nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
{
ENCODE_HEAD;
@@ -2153,20 +2134,21 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
WRITECINFO(link->li_cinfo);
ADJUST_ARGS();
}
+ return nfserr;
}
-static void
+static __be32
nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
{
+ ENCODE_HEAD;
ENCODE_SEQID_OP_HEAD;
if (nfserr)
goto out;
- RESERVE_SPACE(36 + sizeof(stateid_t));
- WRITE32(open->op_stateid.si_generation);
- WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t));
+ nfsd4_encode_stateid(resp, &open->op_stateid);
+ RESERVE_SPACE(40);
WRITECINFO(open->op_cinfo);
WRITE32(open->op_rflags);
WRITE32(2);
@@ -2179,8 +2161,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
case NFS4_OPEN_DELEGATE_NONE:
break;
case NFS4_OPEN_DELEGATE_READ:
- RESERVE_SPACE(20 + sizeof(stateid_t));
- WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+ nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
+ RESERVE_SPACE(20);
WRITE32(open->op_recall);
/*
@@ -2193,8 +2175,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
ADJUST_ARGS();
break;
case NFS4_OPEN_DELEGATE_WRITE:
- RESERVE_SPACE(32 + sizeof(stateid_t));
- WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
+ nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
+ RESERVE_SPACE(32);
WRITE32(0);
/*
@@ -2219,36 +2201,31 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
/* XXX save filehandle here */
out:
ENCODE_SEQID_OP_TAIL(open->op_stateowner);
+ return nfserr;
}
-static void
+static __be32
nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
{
ENCODE_SEQID_OP_HEAD;
-
- if (!nfserr) {
- RESERVE_SPACE(sizeof(stateid_t));
- WRITE32(oc->oc_resp_stateid.si_generation);
- WRITEMEM(&oc->oc_resp_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- }
+
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
ENCODE_SEQID_OP_TAIL(oc->oc_stateowner);
+ return nfserr;
}
-static void
+static __be32
nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
{
ENCODE_SEQID_OP_HEAD;
-
- if (!nfserr) {
- RESERVE_SPACE(sizeof(stateid_t));
- WRITE32(od->od_stateid.si_generation);
- WRITEMEM(&od->od_stateid.si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
- }
+
+ if (!nfserr)
+ nfsd4_encode_stateid(resp, &od->od_stateid);
ENCODE_SEQID_OP_TAIL(od->od_stateowner);
+ return nfserr;
}
static __be32
@@ -2443,7 +2420,7 @@ err_no_verf:
return nfserr;
}
-static void
+static __be32
nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
{
ENCODE_HEAD;
@@ -2453,9 +2430,10 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
WRITECINFO(remove->rm_cinfo);
ADJUST_ARGS();
}
+ return nfserr;
}
-static void
+static __be32
nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
{
ENCODE_HEAD;
@@ -2466,9 +2444,10 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
WRITECINFO(rename->rn_tinfo);
ADJUST_ARGS();
}
+ return nfserr;
}
-static void
+static __be32
nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_secinfo *secinfo)
{
@@ -2532,13 +2511,14 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
out:
if (exp)
exp_put(exp);
+ return nfserr;
}
/*
* The SETATTR encode routine is special -- it always encodes a bitmap,
* regardless of the error status.
*/
-static void
+static __be32
nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
{
ENCODE_HEAD;
@@ -2555,9 +2535,10 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
WRITE32(setattr->sa_bmval[1]);
}
ADJUST_ARGS();
+ return nfserr;
}
-static void
+static __be32
nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
{
ENCODE_HEAD;
@@ -2574,9 +2555,10 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n
WRITE32(0);
ADJUST_ARGS();
}
+ return nfserr;
}
-static void
+static __be32
nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
{
ENCODE_HEAD;
@@ -2588,8 +2570,56 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
WRITEMEM(write->wr_verifier.data, 8);
ADJUST_ARGS();
}
+ return nfserr;
}
+static __be32
+nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
+{
+ return nfserr;
+}
+
+typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
+
+static nfsd4_enc nfsd4_enc_ops[] = {
+ [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access,
+ [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close,
+ [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit,
+ [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create,
+ [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr,
+ [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh,
+ [OP_LINK] = (nfsd4_enc)nfsd4_encode_link,
+ [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock,
+ [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt,
+ [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku,
+ [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open,
+ [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm,
+ [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade,
+ [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_READ] = (nfsd4_enc)nfsd4_encode_read,
+ [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir,
+ [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink,
+ [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove,
+ [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename,
+ [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo,
+ [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr,
+ [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid,
+ [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write,
+ [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop,
+};
+
void
nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
{
@@ -2601,101 +2631,12 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
statp = p++; /* to be backfilled at the end */
ADJUST_ARGS();
- switch (op->opnum) {
- case OP_ACCESS:
- nfsd4_encode_access(resp, op->status, &op->u.access);
- break;
- case OP_CLOSE:
- nfsd4_encode_close(resp, op->status, &op->u.close);
- break;
- case OP_COMMIT:
- nfsd4_encode_commit(resp, op->status, &op->u.commit);
- break;
- case OP_CREATE:
- nfsd4_encode_create(resp, op->status, &op->u.create);
- break;
- case OP_DELEGRETURN:
- break;
- case OP_GETATTR:
- op->status = nfsd4_encode_getattr(resp, op->status, &op->u.getattr);
- break;
- case OP_GETFH:
- nfsd4_encode_getfh(resp, op->status, op->u.getfh);
- break;
- case OP_LINK:
- nfsd4_encode_link(resp, op->status, &op->u.link);
- break;
- case OP_LOCK:
- nfsd4_encode_lock(resp, op->status, &op->u.lock);
- break;
- case OP_LOCKT:
- nfsd4_encode_lockt(resp, op->status, &op->u.lockt);
- break;
- case OP_LOCKU:
- nfsd4_encode_locku(resp, op->status, &op->u.locku);
- break;
- case OP_LOOKUP:
- break;
- case OP_LOOKUPP:
- break;
- case OP_NVERIFY:
- break;
- case OP_OPEN:
- nfsd4_encode_open(resp, op->status, &op->u.open);
- break;
- case OP_OPEN_CONFIRM:
- nfsd4_encode_open_confirm(resp, op->status, &op->u.open_confirm);
- break;
- case OP_OPEN_DOWNGRADE:
- nfsd4_encode_open_downgrade(resp, op->status, &op->u.open_downgrade);
- break;
- case OP_PUTFH:
- break;
- case OP_PUTROOTFH:
- break;
- case OP_READ:
- op->status = nfsd4_encode_read(resp, op->status, &op->u.read);
- break;
- case OP_READDIR:
- op->status = nfsd4_encode_readdir(resp, op->status, &op->u.readdir);
- break;
- case OP_READLINK:
- op->status = nfsd4_encode_readlink(resp, op->status, &op->u.readlink);
- break;
- case OP_REMOVE:
- nfsd4_encode_remove(resp, op->status, &op->u.remove);
- break;
- case OP_RENAME:
- nfsd4_encode_rename(resp, op->status, &op->u.rename);
- break;
- case OP_RENEW:
- break;
- case OP_RESTOREFH:
- break;
- case OP_SAVEFH:
- break;
- case OP_SECINFO:
- nfsd4_encode_secinfo(resp, op->status, &op->u.secinfo);
- break;
- case OP_SETATTR:
- nfsd4_encode_setattr(resp, op->status, &op->u.setattr);
- break;
- case OP_SETCLIENTID:
- nfsd4_encode_setclientid(resp, op->status, &op->u.setclientid);
- break;
- case OP_SETCLIENTID_CONFIRM:
- break;
- case OP_VERIFY:
- break;
- case OP_WRITE:
- nfsd4_encode_write(resp, op->status, &op->u.write);
- break;
- case OP_RELEASE_LOCKOWNER:
- break;
- default:
- break;
- }
-
+ if (op->opnum == OP_ILLEGAL)
+ goto status;
+ BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
+ !nfsd4_enc_ops[op->opnum]);
+ op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
+status:
/*
* Note: We write the status directly, instead of using WRITE32(),
* since it is already in network byte order.
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 5ac00c4fee9..97543df5824 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -12,6 +12,7 @@
#include <linux/time.h>
#include <linux/errno.h>
#include <linux/fs.h>
+#include <linux/namei.h>
#include <linux/fcntl.h>
#include <linux/net.h>
#include <linux/in.h>
@@ -310,9 +311,12 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size)
static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
{
- __be32 server_ip;
- char *fo_path, c;
+ struct sockaddr_in sin = {
+ .sin_family = AF_INET,
+ };
int b1, b2, b3, b4;
+ char c;
+ char *fo_path;
/* sanity check */
if (size == 0)
@@ -326,11 +330,13 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size)
return -EINVAL;
/* get ipv4 address */
- if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4)
+ if (sscanf(fo_path, NIPQUAD_FMT "%c", &b1, &b2, &b3, &b4, &c) != 4)
return -EINVAL;
- server_ip = htonl((((((b1<<8)|b2)<<8)|b3)<<8)|b4);
+ if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255)
+ return -EINVAL;
+ sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4);
- return nlmsvc_unlock_all_by_ip(server_ip);
+ return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin);
}
static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size)
@@ -450,22 +456,26 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
int i;
int rv;
int len;
- int npools = nfsd_nrpools();
+ int npools;
int *nthreads;
+ mutex_lock(&nfsd_mutex);
+ npools = nfsd_nrpools();
if (npools == 0) {
/*
* NFS is shut down. The admin can start it by
* writing to the threads file but NOT the pool_threads
* file, sorry. Report zero threads.
*/
+ mutex_unlock(&nfsd_mutex);
strcpy(buf, "0\n");
return strlen(buf);
}
nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL);
+ rv = -ENOMEM;
if (nthreads == NULL)
- return -ENOMEM;
+ goto out_free;
if (size > 0) {
for (i = 0; i < npools; i++) {
@@ -496,14 +506,16 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
mesg += len;
}
+ mutex_unlock(&nfsd_mutex);
return (mesg-buf);
out_free:
kfree(nthreads);
+ mutex_unlock(&nfsd_mutex);
return rv;
}
-static ssize_t write_versions(struct file *file, char *buf, size_t size)
+static ssize_t __write_versions(struct file *file, char *buf, size_t size)
{
/*
* Format:
@@ -566,14 +578,23 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
return len;
}
-static ssize_t write_ports(struct file *file, char *buf, size_t size)
+static ssize_t write_versions(struct file *file, char *buf, size_t size)
+{
+ ssize_t rv;
+
+ mutex_lock(&nfsd_mutex);
+ rv = __write_versions(file, buf, size);
+ mutex_unlock(&nfsd_mutex);
+ return rv;
+}
+
+static ssize_t __write_ports(struct file *file, char *buf, size_t size)
{
if (size == 0) {
int len = 0;
- lock_kernel();
+
if (nfsd_serv)
len = svc_xprt_names(nfsd_serv, buf, 0);
- unlock_kernel();
return len;
}
/* Either a single 'fd' number is written, in which
@@ -593,19 +614,16 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
return -EINVAL;
err = nfsd_create_serv();
if (!err) {
- int proto = 0;
- err = svc_addsock(nfsd_serv, fd, buf, &proto);
+ err = svc_addsock(nfsd_serv, fd, buf);
if (err >= 0) {
- err = lockd_up(proto);
+ err = lockd_up();
if (err < 0)
svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf);
}
/* Decrease the count, but don't shutdown the
* the service
*/
- lock_kernel();
nfsd_serv->sv_nrthreads--;
- unlock_kernel();
}
return err < 0 ? err : 0;
}
@@ -614,10 +632,8 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
int len = 0;
if (!toclose)
return -ENOMEM;
- lock_kernel();
if (nfsd_serv)
len = svc_sock_names(buf, nfsd_serv, toclose);
- unlock_kernel();
if (len >= 0)
lockd_down();
kfree(toclose);
@@ -655,7 +671,6 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
if (port == 0)
return -EINVAL;
- lock_kernel();
if (nfsd_serv) {
xprt = svc_find_xprt(nfsd_serv, transport,
AF_UNSPEC, port);
@@ -666,13 +681,23 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size)
} else
err = -ENOTCONN;
}
- unlock_kernel();
return err < 0 ? err : 0;
}
}
return -EINVAL;
}
+static ssize_t write_ports(struct file *file, char *buf, size_t size)
+{
+ ssize_t rv;
+
+ mutex_lock(&nfsd_mutex);
+ rv = __write_ports(file, buf, size);
+ mutex_unlock(&nfsd_mutex);
+ return rv;
+}
+
+
int nfsd_max_blksize;
static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
@@ -691,13 +716,13 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
if (bsize > NFSSVC_MAXBLKSIZE)
bsize = NFSSVC_MAXBLKSIZE;
bsize &= ~(1024-1);
- lock_kernel();
+ mutex_lock(&nfsd_mutex);
if (nfsd_serv && nfsd_serv->sv_nrthreads) {
- unlock_kernel();
+ mutex_unlock(&nfsd_mutex);
return -EBUSY;
}
nfsd_max_blksize = bsize;
- unlock_kernel();
+ mutex_unlock(&nfsd_mutex);
}
return sprintf(buf, "%d\n", nfsd_max_blksize);
}
@@ -705,16 +730,17 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
#ifdef CONFIG_NFSD_V4
extern time_t nfs4_leasetime(void);
-static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
+static ssize_t __write_leasetime(struct file *file, char *buf, size_t size)
{
/* if size > 10 seconds, call
* nfs4_reset_lease() then write out the new lease (seconds) as reply
*/
char *mesg = buf;
- int rv;
+ int rv, lease;
if (size > 0) {
- int lease;
+ if (nfsd_serv)
+ return -EBUSY;
rv = get_int(&mesg, &lease);
if (rv)
return rv;
@@ -726,24 +752,52 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
return strlen(buf);
}
-static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
+static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
+{
+ ssize_t rv;
+
+ mutex_lock(&nfsd_mutex);
+ rv = __write_leasetime(file, buf, size);
+ mutex_unlock(&nfsd_mutex);
+ return rv;
+}
+
+extern char *nfs4_recoverydir(void);
+
+static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
{
char *mesg = buf;
char *recdir;
int len, status;
- if (size == 0 || size > PATH_MAX || buf[size-1] != '\n')
- return -EINVAL;
- buf[size-1] = 0;
+ if (size > 0) {
+ if (nfsd_serv)
+ return -EBUSY;
+ if (size > PATH_MAX || buf[size-1] != '\n')
+ return -EINVAL;
+ buf[size-1] = 0;
- recdir = mesg;
- len = qword_get(&mesg, recdir, size);
- if (len <= 0)
- return -EINVAL;
+ recdir = mesg;
+ len = qword_get(&mesg, recdir, size);
+ if (len <= 0)
+ return -EINVAL;
- status = nfs4_reset_recoverydir(recdir);
+ status = nfs4_reset_recoverydir(recdir);
+ }
+ sprintf(buf, "%s\n", nfs4_recoverydir());
return strlen(buf);
}
+
+static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
+{
+ ssize_t rv;
+
+ mutex_lock(&nfsd_mutex);
+ rv = __write_recoverydir(file, buf, size);
+ mutex_unlock(&nfsd_mutex);
+ return rv;
+}
+
#endif
/*----------------------------------------------------------------------------*/
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 100ae564116..cd25d91895a 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -51,7 +51,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
/* make sure parents give x permission to user */
int err;
parent = dget_parent(tdentry);
- err = permission(parent->d_inode, MAY_EXEC, NULL);
+ err = inode_permission(parent->d_inode, MAY_EXEC);
if (err < 0) {
dput(parent);
break;
@@ -176,9 +176,24 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
if (IS_ERR(exp))
return nfserrno(PTR_ERR(exp));
- error = nfsd_setuser_and_check_port(rqstp, exp);
- if (error)
- goto out;
+ if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) {
+ /* Elevate privileges so that the lack of 'r' or 'x'
+ * permission on some parent directory will
+ * not stop exportfs_decode_fh from being able
+ * to reconnect a directory into the dentry cache.
+ * The same problem can affect "SUBTREECHECK" exports,
+ * but as nfsd_acceptable depends on correct
+ * access control settings being in effect, we cannot
+ * fix that case easily.
+ */
+ current->cap_effective =
+ cap_raise_nfsd_set(current->cap_effective,
+ current->cap_permitted);
+ } else {
+ error = nfsd_setuser_and_check_port(rqstp, exp);
+ if (error)
+ goto out;
+ }
/*
* Look up the dentry using the NFS file handle.
@@ -215,6 +230,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
goto out;
}
+ if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) {
+ error = nfsd_setuser_and_check_port(rqstp, exp);
+ if (error) {
+ dput(dentry);
+ goto out;
+ }
+ }
+
if (S_ISDIR(dentry->d_inode->i_mode) &&
(dentry->d_flags & DCACHE_DISCONNECTED)) {
printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n",
@@ -279,17 +302,27 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
if (error)
goto out;
- if (!(access & MAY_LOCK)) {
- /*
- * pseudoflavor restrictions are not enforced on NLM,
- * which clients virtually always use auth_sys for,
- * even while using RPCSEC_GSS for NFS.
- */
- error = check_nfsd_access(exp, rqstp);
- if (error)
- goto out;
- }
+ /*
+ * pseudoflavor restrictions are not enforced on NLM,
+ * which clients virtually always use auth_sys for,
+ * even while using RPCSEC_GSS for NFS.
+ */
+ if (access & NFSD_MAY_LOCK)
+ goto skip_pseudoflavor_check;
+ /*
+ * Clients may expect to be able to use auth_sys during mount,
+ * even if they use gss for everything else; see section 2.3.2
+ * of rfc 2623.
+ */
+ if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT
+ && exp->ex_path.dentry == dentry)
+ goto skip_pseudoflavor_check;
+
+ error = check_nfsd_access(exp, rqstp);
+ if (error)
+ goto out;
+skip_pseudoflavor_check:
/* Finally, check access permissions. */
error = nfsd_permission(rqstp, exp, dentry, access);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6cfc96a1248..5cffeca7ace 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -65,7 +65,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp,
dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
fh_copy(&resp->fh, &argp->fh);
- nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+ nfserr = fh_verify(rqstp, &resp->fh, 0,
+ NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT);
return nfsd_return_attrs(nfserr, resp);
}
@@ -215,11 +216,11 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
SVCFH_fmt(dirfhp), argp->len, argp->name);
/* First verify the parent file handle */
- nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, MAY_EXEC);
+ nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC);
if (nfserr)
goto done; /* must fh_put dirfhp even on error */
- /* Check for MAY_WRITE in nfsd_create if necessary */
+ /* Check for NFSD_MAY_WRITE in nfsd_create if necessary */
nfserr = nfserr_acces;
if (!argp->len)
@@ -281,7 +282,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
nfserr = nfsd_permission(rqstp,
newfhp->fh_export,
newfhp->fh_dentry,
- MAY_WRITE|MAY_LOCAL_ACCESS);
+ NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS);
if (nfserr && nfserr != nfserr_rofs)
goto out_unlock;
}
@@ -521,7 +522,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh));
- nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats);
+ nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
+ NFSD_MAY_BYPASS_GSS_ON_ROOT);
fh_put(&argp->fh);
return nfserr;
}
@@ -614,6 +616,7 @@ nfserrno (int errno)
#endif
{ nfserr_stale, -ESTALE },
{ nfserr_jukebox, -ETIMEDOUT },
+ { nfserr_jukebox, -ERESTARTSYS },
{ nfserr_dropit, -EAGAIN },
{ nfserr_dropit, -ENOMEM },
{ nfserr_badname, -ESRCH },
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 941041f4b13..59eeb46f82c 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -21,6 +21,7 @@
#include <linux/smp_lock.h>
#include <linux/freezer.h>
#include <linux/fs_struct.h>
+#include <linux/kthread.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/stats.h>
@@ -36,28 +37,38 @@
#define NFSDDBG_FACILITY NFSDDBG_SVC
-/* these signals will be delivered to an nfsd thread
- * when handling a request
- */
-#define ALLOWED_SIGS (sigmask(SIGKILL))
-/* these signals will be delivered to an nfsd thread
- * when not handling a request. i.e. when waiting
- */
-#define SHUTDOWN_SIGS (sigmask(SIGKILL) | sigmask(SIGHUP) | sigmask(SIGINT) | sigmask(SIGQUIT))
-/* if the last thread dies with SIGHUP, then the exports table is
- * left unchanged ( like 2.4-{0-9} ). Any other signal will clear
- * the exports table (like 2.2).
- */
-#define SIG_NOCLEAN SIGHUP
-
extern struct svc_program nfsd_program;
-static void nfsd(struct svc_rqst *rqstp);
+static int nfsd(void *vrqstp);
struct timeval nfssvc_boot;
- struct svc_serv *nfsd_serv;
static atomic_t nfsd_busy;
static unsigned long nfsd_last_call;
static DEFINE_SPINLOCK(nfsd_call_lock);
+/*
+ * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
+ * of the svc_serv struct. In particular, ->sv_nrthreads but also to some
+ * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt
+ *
+ * If (out side the lock) nfsd_serv is non-NULL, then it must point to a
+ * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number
+ * of nfsd threads must exist and each must listed in ->sp_all_threads in each
+ * entry of ->sv_pools[].
+ *
+ * Transitions of the thread count between zero and non-zero are of particular
+ * interest since the svc_serv needs to be created and initialized at that
+ * point, or freed.
+ *
+ * Finally, the nfsd_mutex also protects some of the global variables that are
+ * accessed when nfsd starts and that are settable via the write_* routines in
+ * nfsctl.c. In particular:
+ *
+ * user_recovery_dirname
+ * user_lease_time
+ * nfsd_versions
+ */
+DEFINE_MUTEX(nfsd_mutex);
+struct svc_serv *nfsd_serv;
+
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
static struct svc_stat nfsd_acl_svcstats;
static struct svc_version * nfsd_acl_version[] = {
@@ -145,13 +156,14 @@ int nfsd_vers(int vers, enum vers_op change)
int nfsd_nrthreads(void)
{
- if (nfsd_serv == NULL)
- return 0;
- else
- return nfsd_serv->sv_nrthreads;
+ int rv = 0;
+ mutex_lock(&nfsd_mutex);
+ if (nfsd_serv)
+ rv = nfsd_serv->sv_nrthreads;
+ mutex_unlock(&nfsd_mutex);
+ return rv;
}
-static int killsig; /* signal that was used to kill last nfsd */
static void nfsd_last_thread(struct svc_serv *serv)
{
/* When last nfsd thread exits we need to do some clean-up */
@@ -162,11 +174,9 @@ static void nfsd_last_thread(struct svc_serv *serv)
nfsd_racache_shutdown();
nfs4_state_shutdown();
- printk(KERN_WARNING "nfsd: last server has exited\n");
- if (killsig != SIG_NOCLEAN) {
- printk(KERN_WARNING "nfsd: unexporting all filesystems\n");
- nfsd_export_flush();
- }
+ printk(KERN_WARNING "nfsd: last server has exited, flushing export "
+ "cache\n");
+ nfsd_export_flush();
}
void nfsd_reset_versions(void)
@@ -190,13 +200,14 @@ void nfsd_reset_versions(void)
}
}
+
int nfsd_create_serv(void)
{
int err = 0;
- lock_kernel();
+
+ WARN_ON(!mutex_is_locked(&nfsd_mutex));
if (nfsd_serv) {
svc_get(nfsd_serv);
- unlock_kernel();
return 0;
}
if (nfsd_max_blksize == 0) {
@@ -217,13 +228,12 @@ int nfsd_create_serv(void)
}
atomic_set(&nfsd_busy, 0);
- nfsd_serv = svc_create_pooled(&nfsd_program,
- nfsd_max_blksize,
- nfsd_last_thread,
- nfsd, SIG_NOCLEAN, THIS_MODULE);
+ nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
+ AF_INET,
+ nfsd_last_thread, nfsd, THIS_MODULE);
if (nfsd_serv == NULL)
err = -ENOMEM;
- unlock_kernel();
+
do_gettimeofday(&nfssvc_boot); /* record boot time */
return err;
}
@@ -234,25 +244,20 @@ static int nfsd_init_socks(int port)
if (!list_empty(&nfsd_serv->sv_permsocks))
return 0;
- error = lockd_up(IPPROTO_UDP);
- if (error >= 0) {
- error = svc_create_xprt(nfsd_serv, "udp", port,
+ error = svc_create_xprt(nfsd_serv, "udp", port,
SVC_SOCK_DEFAULTS);
- if (error < 0)
- lockd_down();
- }
if (error < 0)
return error;
- error = lockd_up(IPPROTO_TCP);
- if (error >= 0) {
- error = svc_create_xprt(nfsd_serv, "tcp", port,
+ error = svc_create_xprt(nfsd_serv, "tcp", port,
SVC_SOCK_DEFAULTS);
- if (error < 0)
- lockd_down();
- }
if (error < 0)
return error;
+
+ error = lockd_up();
+ if (error < 0)
+ return error;
+
return 0;
}
@@ -282,6 +287,8 @@ int nfsd_set_nrthreads(int n, int *nthreads)
int tot = 0;
int err = 0;
+ WARN_ON(!mutex_is_locked(&nfsd_mutex));
+
if (nfsd_serv == NULL || n <= 0)
return 0;
@@ -316,7 +323,6 @@ int nfsd_set_nrthreads(int n, int *nthreads)
nthreads[0] = 1;
/* apply the new numbers */
- lock_kernel();
svc_get(nfsd_serv);
for (i = 0; i < n; i++) {
err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i],
@@ -325,7 +331,6 @@ int nfsd_set_nrthreads(int n, int *nthreads)
break;
}
svc_destroy(nfsd_serv);
- unlock_kernel();
return err;
}
@@ -334,8 +339,8 @@ int
nfsd_svc(unsigned short port, int nrservs)
{
int error;
-
- lock_kernel();
+
+ mutex_lock(&nfsd_mutex);
dprintk("nfsd: creating service\n");
error = -EINVAL;
if (nrservs <= 0)
@@ -363,7 +368,7 @@ nfsd_svc(unsigned short port, int nrservs)
failure:
svc_destroy(nfsd_serv); /* Release server */
out:
- unlock_kernel();
+ mutex_unlock(&nfsd_mutex);
return error;
}
@@ -391,18 +396,17 @@ update_thread_usage(int busy_threads)
/*
* This is the NFS server kernel thread
*/
-static void
-nfsd(struct svc_rqst *rqstp)
+static int
+nfsd(void *vrqstp)
{
+ struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
struct fs_struct *fsp;
- int err;
- sigset_t shutdown_mask, allowed_mask;
+ int err, preverr = 0;
/* Lock module and set up kernel thread */
- lock_kernel();
- daemonize("nfsd");
+ mutex_lock(&nfsd_mutex);
- /* After daemonize() this kernel thread shares current->fs
+ /* At this point, the thread shares current->fs
* with the init process. We need to create files with a
* umask of 0 instead of init's umask. */
fsp = copy_fs_struct(current->fs);
@@ -414,14 +418,17 @@ nfsd(struct svc_rqst *rqstp)
current->fs = fsp;
current->fs->umask = 0;
- siginitsetinv(&shutdown_mask, SHUTDOWN_SIGS);
- siginitsetinv(&allowed_mask, ALLOWED_SIGS);
+ /*
+ * thread is spawned with all signals set to SIG_IGN, re-enable
+ * the ones that will bring down the thread
+ */
+ allow_signal(SIGKILL);
+ allow_signal(SIGHUP);
+ allow_signal(SIGINT);
+ allow_signal(SIGQUIT);
nfsdstats.th_cnt++;
-
- rqstp->rq_task = current;
-
- unlock_kernel();
+ mutex_unlock(&nfsd_mutex);
/*
* We want less throttling in balance_dirty_pages() so that nfs to
@@ -435,26 +442,30 @@ nfsd(struct svc_rqst *rqstp)
* The main request loop
*/
for (;;) {
- /* Block all but the shutdown signals */
- sigprocmask(SIG_SETMASK, &shutdown_mask, NULL);
-
/*
* Find a socket with data available and call its
* recvfrom routine.
*/
while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN)
;
- if (err < 0)
+ if (err == -EINTR)
break;
+ else if (err < 0) {
+ if (err != preverr) {
+ printk(KERN_WARNING "%s: unexpected error "
+ "from svc_recv (%d)\n", __func__, -err);
+ preverr = err;
+ }
+ schedule_timeout_uninterruptible(HZ);
+ continue;
+ }
+
update_thread_usage(atomic_read(&nfsd_busy));
atomic_inc(&nfsd_busy);
/* Lock the export hash tables for reading. */
exp_readlock();
- /* Process request with signals blocked. */
- sigprocmask(SIG_SETMASK, &allowed_mask, NULL);
-
svc_process(rqstp);
/* Unlock export hash tables */
@@ -463,22 +474,10 @@ nfsd(struct svc_rqst *rqstp)
atomic_dec(&nfsd_busy);
}
- if (err != -EINTR) {
- printk(KERN_WARNING "nfsd: terminating on error %d\n", -err);
- } else {
- unsigned int signo;
-
- for (signo = 1; signo <= _NSIG; signo++)
- if (sigismember(&current->pending.signal, signo) &&
- !sigismember(&current->blocked, signo))
- break;
- killsig = signo;
- }
/* Clear signals before calling svc_exit_thread() */
flush_signals(current);
- lock_kernel();
-
+ mutex_lock(&nfsd_mutex);
nfsdstats.th_cnt --;
out:
@@ -486,8 +485,9 @@ out:
svc_exit_thread(rqstp);
/* Release module */
- unlock_kernel();
+ mutex_unlock(&nfsd_mutex);
module_put_and_exit(0);
+ return 0;
}
static __be32 map_new_errors(u32 vers, __be32 nfserr)
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a3a291f771f..aa1d0d6489a 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -83,7 +83,6 @@ struct raparm_hbucket {
spinlock_t pb_lock;
} ____cacheline_aligned_in_smp;
-static struct raparms * raparml;
#define RAPARM_HASH_BITS 4
#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS)
#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1)
@@ -144,7 +143,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
/* Obtain dentry and export. */
- err = fh_verify(rqstp, fhp, S_IFDIR, MAY_EXEC);
+ err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
if (err)
return err;
@@ -262,14 +261,14 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
{
struct dentry *dentry;
struct inode *inode;
- int accmode = MAY_SATTR;
+ int accmode = NFSD_MAY_SATTR;
int ftype = 0;
__be32 err;
int host_err;
int size_change = 0;
if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
- accmode |= MAY_WRITE|MAY_OWNER_OVERRIDE;
+ accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
if (iap->ia_valid & ATTR_SIZE)
ftype = S_IFREG;
@@ -331,7 +330,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
*/
if (iap->ia_valid & ATTR_SIZE) {
if (iap->ia_size < inode->i_size) {
- err = nfsd_permission(rqstp, fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE);
+ err = nfsd_permission(rqstp, fhp->fh_export, dentry,
+ NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE);
if (err)
goto out;
}
@@ -462,7 +462,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned int flags = 0;
/* Get inode */
- error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, MAY_SATTR);
+ error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
if (error)
return error;
@@ -563,20 +563,20 @@ struct accessmap {
int how;
};
static struct accessmap nfs3_regaccess[] = {
- { NFS3_ACCESS_READ, MAY_READ },
- { NFS3_ACCESS_EXECUTE, MAY_EXEC },
- { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_TRUNC },
- { NFS3_ACCESS_EXTEND, MAY_WRITE },
+ { NFS3_ACCESS_READ, NFSD_MAY_READ },
+ { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC },
+ { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_TRUNC },
+ { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE },
{ 0, 0 }
};
static struct accessmap nfs3_diraccess[] = {
- { NFS3_ACCESS_READ, MAY_READ },
- { NFS3_ACCESS_LOOKUP, MAY_EXEC },
- { NFS3_ACCESS_MODIFY, MAY_EXEC|MAY_WRITE|MAY_TRUNC },
- { NFS3_ACCESS_EXTEND, MAY_EXEC|MAY_WRITE },
- { NFS3_ACCESS_DELETE, MAY_REMOVE },
+ { NFS3_ACCESS_READ, NFSD_MAY_READ },
+ { NFS3_ACCESS_LOOKUP, NFSD_MAY_EXEC },
+ { NFS3_ACCESS_MODIFY, NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC},
+ { NFS3_ACCESS_EXTEND, NFSD_MAY_EXEC|NFSD_MAY_WRITE },
+ { NFS3_ACCESS_DELETE, NFSD_MAY_REMOVE },
{ 0, 0 }
};
@@ -589,10 +589,10 @@ static struct accessmap nfs3_anyaccess[] = {
* mainly at mode bits, and we make sure to ignore read-only
* filesystem checks
*/
- { NFS3_ACCESS_READ, MAY_READ },
- { NFS3_ACCESS_EXECUTE, MAY_EXEC },
- { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_LOCAL_ACCESS },
- { NFS3_ACCESS_EXTEND, MAY_WRITE|MAY_LOCAL_ACCESS },
+ { NFS3_ACCESS_READ, NFSD_MAY_READ },
+ { NFS3_ACCESS_EXECUTE, NFSD_MAY_EXEC },
+ { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS },
+ { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE|NFSD_MAY_LOCAL_ACCESS },
{ 0, 0 }
};
@@ -606,7 +606,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
u32 query, result = 0, sresult = 0;
__be32 error;
- error = fh_verify(rqstp, fhp, 0, MAY_NOP);
+ error = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP);
if (error)
goto out;
@@ -678,7 +678,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
* and (hopefully) checked permission - so allow OWNER_OVERRIDE
* in case a chmod has now revoked permission.
*/
- err = fh_verify(rqstp, fhp, type, access | MAY_OWNER_OVERRIDE);
+ err = fh_verify(rqstp, fhp, type, access | NFSD_MAY_OWNER_OVERRIDE);
if (err)
goto out;
@@ -689,7 +689,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
* or any access when mandatory locking enabled
*/
err = nfserr_perm;
- if (IS_APPEND(inode) && (access & MAY_WRITE))
+ if (IS_APPEND(inode) && (access & NFSD_MAY_WRITE))
goto out;
/*
* We must ignore files (but only files) which might have mandatory
@@ -706,14 +706,14 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
* Check to see if there are any leases on this file.
* This may block while leases are broken.
*/
- host_err = break_lease(inode, O_NONBLOCK | ((access & MAY_WRITE) ? FMODE_WRITE : 0));
+ host_err = break_lease(inode, O_NONBLOCK | ((access & NFSD_MAY_WRITE) ? FMODE_WRITE : 0));
if (host_err == -EWOULDBLOCK)
host_err = -ETIMEDOUT;
if (host_err) /* NOMEM or WOULDBLOCK */
goto out_nfserr;
- if (access & MAY_WRITE) {
- if (access & MAY_READ)
+ if (access & NFSD_MAY_WRITE) {
+ if (access & NFSD_MAY_READ)
flags = O_RDWR|O_LARGEFILE;
else
flags = O_WRONLY|O_LARGEFILE;
@@ -1069,12 +1069,12 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
if (file) {
err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
- MAY_READ|MAY_OWNER_OVERRIDE);
+ NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE);
if (err)
goto out;
err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
} else {
- err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file);
+ err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
if (err)
goto out;
err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
@@ -1098,13 +1098,13 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
if (file) {
err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
- MAY_WRITE|MAY_OWNER_OVERRIDE);
+ NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE);
if (err)
goto out;
err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
stablep);
} else {
- err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file);
+ err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
if (err)
goto out;
@@ -1136,7 +1136,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
if ((u64)count > ~(u64)offset)
return nfserr_inval;
- if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0)
+ err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
+ if (err)
return err;
if (EX_ISSYNC(fhp->fh_export)) {
if (file->f_op && file->f_op->fsync) {
@@ -1197,7 +1198,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (isdotent(fname, flen))
goto out;
- err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
+ err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
if (err)
goto out;
@@ -1248,36 +1249,34 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
iap->ia_mode = 0;
iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;
+ err = nfserr_inval;
+ if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) {
+ printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
+ type);
+ goto out;
+ }
+
+ host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_nfserr;
+
/*
* Get the dir op function pointer.
*/
err = 0;
switch (type) {
case S_IFREG:
- host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
- if (host_err)
- goto out_nfserr;
host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
break;
case S_IFDIR:
- host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
- if (host_err)
- goto out_nfserr;
host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
- host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
- if (host_err)
- goto out_nfserr;
host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
break;
- default:
- printk("nfsd: bad file type %o in nfsd_create\n", type);
- host_err = -EINVAL;
- goto out_nfserr;
}
if (host_err < 0) {
mnt_drop_write(fhp->fh_export->ex_path.mnt);
@@ -1289,7 +1288,6 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
write_inode_now(dchild->d_inode, 1);
}
-
err2 = nfsd_create_setattr(rqstp, resfhp, iap);
if (err2)
err = err2;
@@ -1334,7 +1332,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
if (!(iap->ia_valid & ATTR_MODE))
iap->ia_mode = 0;
- err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
+ err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
if (err)
goto out;
@@ -1471,7 +1469,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
__be32 err;
int host_err;
- err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP);
+ err = fh_verify(rqstp, fhp, S_IFLNK, NFSD_MAY_NOP);
if (err)
goto out;
@@ -1517,7 +1515,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct dentry *dentry, *dnew;
__be32 err, cerr;
int host_err;
- umode_t mode;
err = nfserr_noent;
if (!flen || !plen)
@@ -1526,7 +1523,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (isdotent(fname, flen))
goto out;
- err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE);
+ err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
if (err)
goto out;
fh_lock(fhp);
@@ -1536,11 +1533,6 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (IS_ERR(dnew))
goto out_nfserr;
- mode = S_IALLUGO;
- /* Only the MODE ATTRibute is even vaguely meaningful */
- if (iap && (iap->ia_valid & ATTR_MODE))
- mode = iap->ia_mode & S_IALLUGO;
-
host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
if (host_err)
goto out_nfserr;
@@ -1552,11 +1544,11 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
else {
strncpy(path_alloced, path, plen);
path_alloced[plen] = 0;
- host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced, mode);
+ host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced);
kfree(path_alloced);
}
} else
- host_err = vfs_symlink(dentry->d_inode, dnew, path, mode);
+ host_err = vfs_symlink(dentry->d_inode, dnew, path);
if (!host_err) {
if (EX_ISSYNC(fhp->fh_export))
@@ -1591,10 +1583,10 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
__be32 err;
int host_err;
- err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE);
+ err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE);
if (err)
goto out;
- err = fh_verify(rqstp, tfhp, -S_IFDIR, MAY_NOP);
+ err = fh_verify(rqstp, tfhp, -S_IFDIR, NFSD_MAY_NOP);
if (err)
goto out;
@@ -1661,10 +1653,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
__be32 err;
int host_err;
- err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE);
+ err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
if (err)
goto out;
- err = fh_verify(rqstp, tfhp, S_IFDIR, MAY_CREATE);
+ err = fh_verify(rqstp, tfhp, S_IFDIR, NFSD_MAY_CREATE);
if (err)
goto out;
@@ -1768,7 +1760,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
err = nfserr_acces;
if (!flen || isdotent(fname, flen))
goto out;
- err = fh_verify(rqstp, fhp, S_IFDIR, MAY_REMOVE);
+ err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_REMOVE);
if (err)
goto out;
@@ -1834,7 +1826,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
struct file *file;
loff_t offset = *offsetp;
- err = nfsd_open(rqstp, fhp, S_IFDIR, MAY_READ, &file);
+ err = nfsd_open(rqstp, fhp, S_IFDIR, NFSD_MAY_READ, &file);
if (err)
goto out;
@@ -1873,9 +1865,9 @@ out:
* N.B. After this call fhp needs an fh_put
*/
__be32
-nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat)
+nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
{
- __be32 err = fh_verify(rqstp, fhp, 0, MAY_NOP);
+ __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
if (!err && vfs_statfs(fhp->fh_dentry,stat))
err = nfserr_io;
return err;
@@ -1896,18 +1888,18 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
struct inode *inode = dentry->d_inode;
int err;
- if (acc == MAY_NOP)
+ if (acc == NFSD_MAY_NOP)
return 0;
#if 0
dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s\n",
acc,
- (acc & MAY_READ)? " read" : "",
- (acc & MAY_WRITE)? " write" : "",
- (acc & MAY_EXEC)? " exec" : "",
- (acc & MAY_SATTR)? " sattr" : "",
- (acc & MAY_TRUNC)? " trunc" : "",
- (acc & MAY_LOCK)? " lock" : "",
- (acc & MAY_OWNER_OVERRIDE)? " owneroverride" : "",
+ (acc & NFSD_MAY_READ)? " read" : "",
+ (acc & NFSD_MAY_WRITE)? " write" : "",
+ (acc & NFSD_MAY_EXEC)? " exec" : "",
+ (acc & NFSD_MAY_SATTR)? " sattr" : "",
+ (acc & NFSD_MAY_TRUNC)? " trunc" : "",
+ (acc & NFSD_MAY_LOCK)? " lock" : "",
+ (acc & NFSD_MAY_OWNER_OVERRIDE)? " owneroverride" : "",
inode->i_mode,
IS_IMMUTABLE(inode)? " immut" : "",
IS_APPEND(inode)? " append" : "",
@@ -1920,18 +1912,18 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
* system. But if it is IRIX doing check on write-access for a
* device special file, we ignore rofs.
*/
- if (!(acc & MAY_LOCAL_ACCESS))
- if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
+ if (!(acc & NFSD_MAY_LOCAL_ACCESS))
+ if (acc & (NFSD_MAY_WRITE | NFSD_MAY_SATTR | NFSD_MAY_TRUNC)) {
if (exp_rdonly(rqstp, exp) ||
__mnt_is_readonly(exp->ex_path.mnt))
return nfserr_rofs;
- if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
+ if (/* (acc & NFSD_MAY_WRITE) && */ IS_IMMUTABLE(inode))
return nfserr_perm;
}
- if ((acc & MAY_TRUNC) && IS_APPEND(inode))
+ if ((acc & NFSD_MAY_TRUNC) && IS_APPEND(inode))
return nfserr_perm;
- if (acc & MAY_LOCK) {
+ if (acc & NFSD_MAY_LOCK) {
/* If we cannot rely on authentication in NLM requests,
* just allow locks, otherwise require read permission, or
* ownership
@@ -1939,7 +1931,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
if (exp->ex_flags & NFSEXP_NOAUTHNLM)
return 0;
else
- acc = MAY_READ | MAY_OWNER_OVERRIDE;
+ acc = NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE;
}
/*
* The file owner always gets access permission for accesses that
@@ -1955,16 +1947,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
* We must trust the client to do permission checking - using "ACCESS"
* with NFSv3.
*/
- if ((acc & MAY_OWNER_OVERRIDE) &&
+ if ((acc & NFSD_MAY_OWNER_OVERRIDE) &&
inode->i_uid == current->fsuid)
return 0;
- err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC), NULL);
+ /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */
+ err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC));
/* Allow read access to binaries even when mode 111 */
if (err == -EACCES && S_ISREG(inode->i_mode) &&
- acc == (MAY_READ | MAY_OWNER_OVERRIDE))
- err = permission(inode, MAY_EXEC, NULL);
+ acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
+ err = inode_permission(inode, MAY_EXEC);
return err? nfserrno(err) : 0;
}
@@ -1972,11 +1965,20 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
void
nfsd_racache_shutdown(void)
{
- if (!raparml)
- return;
+ struct raparms *raparm, *last_raparm;
+ unsigned int i;
+
dprintk("nfsd: freeing readahead buffers.\n");
- kfree(raparml);
- raparml = NULL;
+
+ for (i = 0; i < RAPARM_HASH_SIZE; i++) {
+ raparm = raparm_hash[i].pb_head;
+ while(raparm) {
+ last_raparm = raparm;
+ raparm = raparm->p_next;
+ kfree(last_raparm);
+ }
+ raparm_hash[i].pb_head = NULL;
+ }
}
/*
* Initialize readahead param cache
@@ -1987,35 +1989,38 @@ nfsd_racache_init(int cache_size)
int i;
int j = 0;
int nperbucket;
+ struct raparms **raparm = NULL;
- if (raparml)
+ if (raparm_hash[0].pb_head)
return 0;
- if (cache_size < 2*RAPARM_HASH_SIZE)
- cache_size = 2*RAPARM_HASH_SIZE;
- raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL);
-
- if (!raparml) {
- printk(KERN_WARNING
- "nfsd: Could not allocate memory read-ahead cache.\n");
- return -ENOMEM;
- }
+ nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
+ if (nperbucket < 2)
+ nperbucket = 2;
+ cache_size = nperbucket * RAPARM_HASH_SIZE;
dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
- for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) {
- raparm_hash[i].pb_head = NULL;
+
+ for (i = 0; i < RAPARM_HASH_SIZE; i++) {
spin_lock_init(&raparm_hash[i].pb_lock);
- }
- nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
- for (i = 0; i < cache_size - 1; i++) {
- if (i % nperbucket == 0)
- raparm_hash[j++].pb_head = raparml + i;
- if (i % nperbucket < nperbucket-1)
- raparml[i].p_next = raparml + i + 1;
+
+ raparm = &raparm_hash[i].pb_head;
+ for (j = 0; j < nperbucket; j++) {
+ *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
+ if (!*raparm)
+ goto out_nomem;
+ raparm = &(*raparm)->p_next;
+ }
+ *raparm = NULL;
}
nfsdstats.ra_size = cache_size;
return 0;
+
+out_nomem:
+ dprintk("nfsd: kmalloc failed, freeing readahead buffers\n");
+ nfsd_racache_shutdown();
+ return -ENOMEM;
}
#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 00e9ccde8e4..b38f944f066 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1194,7 +1194,7 @@ lock_retry_remap:
tbh = bhs[i];
if (!tbh)
continue;
- if (unlikely(test_set_buffer_locked(tbh)))
+ if (!trylock_buffer(tbh))
BUG();
/* The buffer dirty state is now irrelevant, just clean it. */
clear_buffer_dirty(tbh);
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 33ff314cc50..9669541d011 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -665,7 +665,7 @@ lock_retry_remap:
for (i = 0; i < nr_bhs; i++) {
struct buffer_head *tbh = bhs[i];
- if (unlikely(test_set_buffer_locked(tbh)))
+ if (!trylock_buffer(tbh))
continue;
if (unlikely(buffer_uptodate(tbh))) {
unlock_buffer(tbh);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 3c5550cd11d..d020866d423 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2118,7 +2118,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
goto out;
if (!count)
goto out;
- err = remove_suid(file->f_path.dentry);
+ err = file_remove_suid(file);
if (err)
goto out;
file_update_time(file);
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 790defb847e..17d32ca6bc3 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -586,7 +586,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
struct buffer_head *tbh = bhs[i_bhs];
- if (unlikely(test_set_buffer_locked(tbh)))
+ if (!trylock_buffer(tbh))
BUG();
BUG_ON(!buffer_uptodate(tbh));
clear_buffer_dirty(tbh);
@@ -779,7 +779,7 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
struct buffer_head *tbh = bhs[i_bhs];
- if (unlikely(test_set_buffer_locked(tbh)))
+ if (!trylock_buffer(tbh))
BUG();
BUG_ON(!buffer_uptodate(tbh));
clear_buffer_dirty(tbh);
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index e1781c8b165..9e8a95be7a1 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -174,7 +174,6 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
// TODO: Consider moving this lot to a separate function! (AIA)
handle_name:
{
- struct dentry *real_dent, *new_dent;
MFT_RECORD *m;
ntfs_attr_search_ctx *ctx;
ntfs_inode *ni = NTFS_I(dent_inode);
@@ -255,93 +254,9 @@ handle_name:
}
nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
- /*
- * Note: No need for dent->d_lock lock as i_mutex is held on the
- * parent inode.
- */
-
- /* Does a dentry matching the nls_name exist already? */
- real_dent = d_lookup(dent->d_parent, &nls_name);
- /* If not, create it now. */
- if (!real_dent) {
- real_dent = d_alloc(dent->d_parent, &nls_name);
- kfree(nls_name.name);
- if (!real_dent) {
- err = -ENOMEM;
- goto err_out;
- }
- new_dent = d_splice_alias(dent_inode, real_dent);
- if (new_dent)
- dput(real_dent);
- else
- new_dent = real_dent;
- ntfs_debug("Done. (Created new dentry.)");
- return new_dent;
- }
+ dent = d_add_ci(dent, dent_inode, &nls_name);
kfree(nls_name.name);
- /* Matching dentry exists, check if it is negative. */
- if (real_dent->d_inode) {
- if (unlikely(real_dent->d_inode != dent_inode)) {
- /* This can happen because bad inodes are unhashed. */
- BUG_ON(!is_bad_inode(dent_inode));
- BUG_ON(!is_bad_inode(real_dent->d_inode));
- }
- /*
- * Already have the inode and the dentry attached, decrement
- * the reference count to balance the ntfs_iget() we did
- * earlier on. We found the dentry using d_lookup() so it
- * cannot be disconnected and thus we do not need to worry
- * about any NFS/disconnectedness issues here.
- */
- iput(dent_inode);
- ntfs_debug("Done. (Already had inode and dentry.)");
- return real_dent;
- }
- /*
- * Negative dentry: instantiate it unless the inode is a directory and
- * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED),
- * in which case d_move() that in place of the found dentry.
- */
- if (!S_ISDIR(dent_inode->i_mode)) {
- /* Not a directory; everything is easy. */
- d_instantiate(real_dent, dent_inode);
- ntfs_debug("Done. (Already had negative file dentry.)");
- return real_dent;
- }
- spin_lock(&dcache_lock);
- if (list_empty(&dent_inode->i_dentry)) {
- /*
- * Directory without a 'disconnected' dentry; we need to do
- * d_instantiate() by hand because it takes dcache_lock which
- * we already hold.
- */
- list_add(&real_dent->d_alias, &dent_inode->i_dentry);
- real_dent->d_inode = dent_inode;
- spin_unlock(&dcache_lock);
- security_d_instantiate(real_dent, dent_inode);
- ntfs_debug("Done. (Already had negative directory dentry.)");
- return real_dent;
- }
- /*
- * Directory with a 'disconnected' dentry; get a reference to the
- * 'disconnected' dentry.
- */
- new_dent = list_entry(dent_inode->i_dentry.next, struct dentry,
- d_alias);
- dget_locked(new_dent);
- spin_unlock(&dcache_lock);
- /* Do security vodoo. */
- security_d_instantiate(real_dent, dent_inode);
- /* Move new_dent in place of real_dent. */
- d_move(new_dent, real_dent);
- /* Balance the ntfs_iget() we did above. */
- iput(dent_inode);
- /* Throw away real_dent. */
- dput(real_dent);
- /* Use new_dent as the actual dentry. */
- ntfs_debug("Done. (Already had negative, disconnected directory "
- "dentry.)");
- return new_dent;
+ return dent;
eio_err_out:
ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk.");
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 3e76f3b216b..4a46743b507 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3080,7 +3080,7 @@ struct kmem_cache *ntfs_inode_cache;
struct kmem_cache *ntfs_big_inode_cache;
/* Init once constructor for the inode slab cache. */
-static void ntfs_big_inode_init_once(struct kmem_cache *cachep, void *foo)
+static void ntfs_big_inode_init_once(void *foo)
{
ntfs_inode *ni = (ntfs_inode *)foo;
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 3a8af75351e..4087fbdac32 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -113,7 +113,7 @@ typedef struct {
* Reason flags (32-bit). Cumulative flags describing the change(s) to the
* file since it was last opened. I think the names speak for themselves but
* if you disagree check out the descriptions in the Linux NTFS project NTFS
- * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html
+ * documentation: http://www.linux-ntfs.org/
*/
enum {
USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001),
@@ -145,7 +145,7 @@ typedef le32 USN_REASON_FLAGS;
* Source info flags (32-bit). Information about the source of the change(s)
* to the file. For detailed descriptions of what these mean, see the Linux
* NTFS project NTFS documentation:
- * http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html
+ * http://www.linux-ntfs.org/
*/
enum {
USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001),
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f6956de56fd..589dcdfdfe3 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -34,7 +34,8 @@ ocfs2-objs := \
symlink.o \
sysfile.o \
uptodate.o \
- ver.o
+ ver.o \
+ xattr.o
ocfs2_stackglue-objs := stackglue.o
ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 10bfb466e06..0cc2deb9394 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -49,6 +49,340 @@
#include "buffer_head_io.h"
+
+/*
+ * Operations for a specific extent tree type.
+ *
+ * To implement an on-disk btree (extent tree) type in ocfs2, add
+ * an ocfs2_extent_tree_operations structure and the matching
+ * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it
+ * for the allocation portion of the extent tree.
+ */
+struct ocfs2_extent_tree_operations {
+ /*
+ * last_eb_blk is the block number of the right most leaf extent
+ * block. Most on-disk structures containing an extent tree store
+ * this value for fast access. The ->eo_set_last_eb_blk() and
+ * ->eo_get_last_eb_blk() operations access this value. They are
+ * both required.
+ */
+ void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
+ u64 blkno);
+ u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
+
+ /*
+ * The on-disk structure usually keeps track of how many total
+ * clusters are stored in this extent tree. This function updates
+ * that value. new_clusters is the delta, and must be
+ * added to the total. Required.
+ */
+ void (*eo_update_clusters)(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 new_clusters);
+
+ /*
+ * If ->eo_insert_check() exists, it is called before rec is
+ * inserted into the extent tree. It is optional.
+ */
+ int (*eo_insert_check)(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec);
+ int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
+
+ /*
+ * --------------------------------------------------------------
+ * The remaining are internal to ocfs2_extent_tree and don't have
+ * accessor functions
+ */
+
+ /*
+ * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
+ * It is required.
+ */
+ void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
+
+ /*
+ * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
+ * it exists. If it does not, et->et_max_leaf_clusters is set
+ * to 0 (unlimited). Optional.
+ */
+ void (*eo_fill_max_leaf_clusters)(struct inode *inode,
+ struct ocfs2_extent_tree *et);
+};
+
+
+/*
+ * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
+ * in the methods.
+ */
+static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
+static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno);
+static void ocfs2_dinode_update_clusters(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters);
+static int ocfs2_dinode_insert_check(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec);
+static int ocfs2_dinode_sanity_check(struct inode *inode,
+ struct ocfs2_extent_tree *et);
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
+static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+ .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk,
+ .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk,
+ .eo_update_clusters = ocfs2_dinode_update_clusters,
+ .eo_insert_check = ocfs2_dinode_insert_check,
+ .eo_sanity_check = ocfs2_dinode_sanity_check,
+ .eo_fill_root_el = ocfs2_dinode_fill_root_el,
+};
+
+static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno)
+{
+ struct ocfs2_dinode *di = et->et_object;
+
+ BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+ di->i_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_dinode *di = et->et_object;
+
+ BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+ return le64_to_cpu(di->i_last_eb_blk);
+}
+
+static void ocfs2_dinode_update_clusters(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct ocfs2_dinode *di = et->et_object;
+
+ le32_add_cpu(&di->i_clusters, clusters);
+ spin_lock(&OCFS2_I(inode)->ip_lock);
+ OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
+ spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
+
+static int ocfs2_dinode_insert_check(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
+ mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
+ (OCFS2_I(inode)->ip_clusters != rec->e_cpos),
+ "Device %s, asking for sparse allocation: inode %llu, "
+ "cpos %u, clusters %u\n",
+ osb->dev_str,
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ rec->e_cpos,
+ OCFS2_I(inode)->ip_clusters);
+
+ return 0;
+}
+
+static int ocfs2_dinode_sanity_check(struct inode *inode,
+ struct ocfs2_extent_tree *et)
+{
+ int ret = 0;
+ struct ocfs2_dinode *di;
+
+ BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+
+ di = et->et_object;
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ ret = -EIO;
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has invalid path root",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ }
+
+ return ret;
+}
+
+static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_dinode *di = et->et_object;
+
+ et->et_root_el = &di->id2.i_list;
+}
+
+
+static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_xattr_value_root *xv = et->et_object;
+
+ et->et_root_el = &xv->xr_list;
+}
+
+static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno)
+{
+ struct ocfs2_xattr_value_root *xv =
+ (struct ocfs2_xattr_value_root *)et->et_object;
+
+ xv->xr_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_xattr_value_root *xv =
+ (struct ocfs2_xattr_value_root *) et->et_object;
+
+ return le64_to_cpu(xv->xr_last_eb_blk);
+}
+
+static void ocfs2_xattr_value_update_clusters(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct ocfs2_xattr_value_root *xv =
+ (struct ocfs2_xattr_value_root *)et->et_object;
+
+ le32_add_cpu(&xv->xr_clusters, clusters);
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
+ .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk,
+ .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk,
+ .eo_update_clusters = ocfs2_xattr_value_update_clusters,
+ .eo_fill_root_el = ocfs2_xattr_value_fill_root_el,
+};
+
+static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_xattr_block *xb = et->et_object;
+
+ et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
+}
+
+static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
+ struct ocfs2_extent_tree *et)
+{
+ et->et_max_leaf_clusters =
+ ocfs2_clusters_for_bytes(inode->i_sb,
+ OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
+}
+
+static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 blkno)
+{
+ struct ocfs2_xattr_block *xb = et->et_object;
+ struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+ xt->xt_last_eb_blk = cpu_to_le64(blkno);
+}
+
+static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ struct ocfs2_xattr_block *xb = et->et_object;
+ struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+
+ return le64_to_cpu(xt->xt_last_eb_blk);
+}
+
+static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ struct ocfs2_xattr_block *xb = et->et_object;
+
+ le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
+}
+
+static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
+ .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk,
+ .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk,
+ .eo_update_clusters = ocfs2_xattr_tree_update_clusters,
+ .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el,
+ .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
+};
+
+static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
+ struct inode *inode,
+ struct buffer_head *bh,
+ void *obj,
+ struct ocfs2_extent_tree_operations *ops)
+{
+ et->et_ops = ops;
+ et->et_root_bh = bh;
+ if (!obj)
+ obj = (void *)bh->b_data;
+ et->et_object = obj;
+
+ et->et_ops->eo_fill_root_el(et);
+ if (!et->et_ops->eo_fill_max_leaf_clusters)
+ et->et_max_leaf_clusters = 0;
+ else
+ et->et_ops->eo_fill_max_leaf_clusters(inode, et);
+}
+
+void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
+ struct inode *inode,
+ struct buffer_head *bh)
+{
+ __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
+}
+
+void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+ struct inode *inode,
+ struct buffer_head *bh)
+{
+ __ocfs2_init_extent_tree(et, inode, bh, NULL,
+ &ocfs2_xattr_tree_et_ops);
+}
+
+void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+ struct inode *inode,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_value_root *xv)
+{
+ __ocfs2_init_extent_tree(et, inode, bh, xv,
+ &ocfs2_xattr_value_et_ops);
+}
+
+static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
+ u64 new_last_eb_blk)
+{
+ et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
+}
+
+static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
+{
+ return et->et_ops->eo_get_last_eb_blk(et);
+}
+
+static inline void ocfs2_et_update_clusters(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters)
+{
+ et->et_ops->eo_update_clusters(inode, et, clusters);
+}
+
+static inline int ocfs2_et_insert_check(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ struct ocfs2_extent_rec *rec)
+{
+ int ret = 0;
+
+ if (et->et_ops->eo_insert_check)
+ ret = et->et_ops->eo_insert_check(inode, et, rec);
+ return ret;
+}
+
+static inline int ocfs2_et_sanity_check(struct inode *inode,
+ struct ocfs2_extent_tree *et)
+{
+ int ret = 0;
+
+ if (et->et_ops->eo_sanity_check)
+ ret = et->et_ops->eo_sanity_check(inode, et);
+ return ret;
+}
+
static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
struct ocfs2_extent_block *eb);
@@ -205,17 +539,6 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
}
/*
- * Allocate and initialize a new path based on a disk inode tree.
- */
-static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
-{
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
- struct ocfs2_extent_list *el = &di->id2.i_list;
-
- return ocfs2_new_path(di_bh, el);
-}
-
-/*
* Convenience function to journal all components in a path.
*/
static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
@@ -368,39 +691,35 @@ struct ocfs2_merge_ctxt {
*/
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct inode *inode,
- struct ocfs2_dinode *fe)
+ struct ocfs2_extent_tree *et)
{
int retval;
- struct ocfs2_extent_list *el;
+ struct ocfs2_extent_list *el = NULL;
struct ocfs2_extent_block *eb;
struct buffer_head *eb_bh = NULL;
+ u64 last_eb_blk = 0;
mlog_entry_void();
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- retval = -EIO;
- goto bail;
- }
+ el = et->et_root_el;
+ last_eb_blk = ocfs2_et_get_last_eb_blk(et);
- if (fe->i_last_eb_blk) {
- retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
- &eb_bh, OCFS2_BH_CACHED, inode);
+ if (last_eb_blk) {
+ retval = ocfs2_read_block(inode, last_eb_blk,
+ &eb_bh);
if (retval < 0) {
mlog_errno(retval);
goto bail;
}
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
- } else
- el = &fe->id2.i_list;
+ }
BUG_ON(el->l_tree_depth != 0);
retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
bail:
- if (eb_bh)
- brelse(eb_bh);
+ brelse(eb_bh);
mlog_exit(retval);
return retval;
@@ -486,8 +805,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
bail:
if (status < 0) {
for(i = 0; i < wanted; i++) {
- if (bhs[i])
- brelse(bhs[i]);
+ brelse(bhs[i]);
bhs[i] = NULL;
}
}
@@ -531,7 +849,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
static int ocfs2_add_branch(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
- struct buffer_head *fe_bh,
+ struct ocfs2_extent_tree *et,
struct buffer_head *eb_bh,
struct buffer_head **last_eb_bh,
struct ocfs2_alloc_context *meta_ac)
@@ -540,7 +858,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
u64 next_blkno, new_last_eb_blk;
struct buffer_head *bh;
struct buffer_head **new_eb_bhs = NULL;
- struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *eb_el;
struct ocfs2_extent_list *el;
@@ -550,13 +867,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
BUG_ON(!last_eb_bh || !*last_eb_bh);
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
-
if (eb_bh) {
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
} else
- el = &fe->id2.i_list;
+ el = et->et_root_el;
/* we never add a branch to a leaf. */
BUG_ON(!el->l_tree_depth);
@@ -646,7 +961,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
mlog_errno(status);
goto bail;
}
- status = ocfs2_journal_access(handle, inode, fe_bh,
+ status = ocfs2_journal_access(handle, inode, et->et_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -662,7 +977,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
}
/* Link the new branch into the rest of the tree (el will
- * either be on the fe, or the extent block passed in. */
+ * either be on the root_bh, or the extent block passed in. */
i = le16_to_cpu(el->l_next_free_rec);
el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
@@ -671,7 +986,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
/* fe needs a new last extent block pointer, as does the
* next_leaf on the previously last-extent-block. */
- fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+ ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
@@ -679,7 +994,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
status = ocfs2_journal_dirty(handle, *last_eb_bh);
if (status < 0)
mlog_errno(status);
- status = ocfs2_journal_dirty(handle, fe_bh);
+ status = ocfs2_journal_dirty(handle, et->et_root_bh);
if (status < 0)
mlog_errno(status);
if (eb_bh) {
@@ -700,8 +1015,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
bail:
if (new_eb_bhs) {
for (i = 0; i < new_blocks; i++)
- if (new_eb_bhs[i])
- brelse(new_eb_bhs[i]);
+ brelse(new_eb_bhs[i]);
kfree(new_eb_bhs);
}
@@ -717,16 +1031,15 @@ bail:
static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
- struct buffer_head *fe_bh,
+ struct ocfs2_extent_tree *et,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **ret_new_eb_bh)
{
int status, i;
u32 new_clusters;
struct buffer_head *new_eb_bh = NULL;
- struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
- struct ocfs2_extent_list *fe_el;
+ struct ocfs2_extent_list *root_el;
struct ocfs2_extent_list *eb_el;
mlog_entry_void();
@@ -746,8 +1059,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
}
eb_el = &eb->h_list;
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
- fe_el = &fe->id2.i_list;
+ root_el = et->et_root_el;
status = ocfs2_journal_access(handle, inode, new_eb_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
@@ -756,11 +1068,11 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
goto bail;
}
- /* copy the fe data into the new extent block */
- eb_el->l_tree_depth = fe_el->l_tree_depth;
- eb_el->l_next_free_rec = fe_el->l_next_free_rec;
- for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
- eb_el->l_recs[i] = fe_el->l_recs[i];
+ /* copy the root extent list data into the new extent block */
+ eb_el->l_tree_depth = root_el->l_tree_depth;
+ eb_el->l_next_free_rec = root_el->l_next_free_rec;
+ for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+ eb_el->l_recs[i] = root_el->l_recs[i];
status = ocfs2_journal_dirty(handle, new_eb_bh);
if (status < 0) {
@@ -768,7 +1080,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_journal_access(handle, inode, fe_bh,
+ status = ocfs2_journal_access(handle, inode, et->et_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -777,21 +1089,21 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
new_clusters = ocfs2_sum_rightmost_rec(eb_el);
- /* update fe now */
- le16_add_cpu(&fe_el->l_tree_depth, 1);
- fe_el->l_recs[0].e_cpos = 0;
- fe_el->l_recs[0].e_blkno = eb->h_blkno;
- fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
- for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
- memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
- fe_el->l_next_free_rec = cpu_to_le16(1);
+ /* update root_bh now */
+ le16_add_cpu(&root_el->l_tree_depth, 1);
+ root_el->l_recs[0].e_cpos = 0;
+ root_el->l_recs[0].e_blkno = eb->h_blkno;
+ root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+ for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+ memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
+ root_el->l_next_free_rec = cpu_to_le16(1);
/* If this is our 1st tree depth shift, then last_eb_blk
* becomes the allocated extent block */
- if (fe_el->l_tree_depth == cpu_to_le16(1))
- fe->i_last_eb_blk = eb->h_blkno;
+ if (root_el->l_tree_depth == cpu_to_le16(1))
+ ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
- status = ocfs2_journal_dirty(handle, fe_bh);
+ status = ocfs2_journal_dirty(handle, et->et_root_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -801,8 +1113,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
new_eb_bh = NULL;
status = 0;
bail:
- if (new_eb_bh)
- brelse(new_eb_bh);
+ brelse(new_eb_bh);
mlog_exit(status);
return status;
@@ -817,22 +1128,21 @@ bail:
* 1) a lowest extent block is found, then we pass it back in
* *lowest_eb_bh and return '0'
*
- * 2) the search fails to find anything, but the dinode has room. We
+ * 2) the search fails to find anything, but the root_el has room. We
* pass NULL back in *lowest_eb_bh, but still return '0'
*
- * 3) the search fails to find anything AND the dinode is full, in
+ * 3) the search fails to find anything AND the root_el is full, in
* which case we return > 0
*
* return status < 0 indicates an error.
*/
static int ocfs2_find_branch_target(struct ocfs2_super *osb,
struct inode *inode,
- struct buffer_head *fe_bh,
+ struct ocfs2_extent_tree *et,
struct buffer_head **target_bh)
{
int status = 0, i;
u64 blkno;
- struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct buffer_head *bh = NULL;
@@ -842,8 +1152,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
*target_bh = NULL;
- fe = (struct ocfs2_dinode *) fe_bh->b_data;
- el = &fe->id2.i_list;
+ el = et->et_root_el;
while(le16_to_cpu(el->l_tree_depth) > 1) {
if (le16_to_cpu(el->l_next_free_rec) == 0) {
@@ -864,13 +1173,10 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
goto bail;
}
- if (bh) {
- brelse(bh);
- bh = NULL;
- }
+ brelse(bh);
+ bh = NULL;
- status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
- inode);
+ status = ocfs2_read_block(inode, blkno, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -886,8 +1192,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
if (le16_to_cpu(el->l_next_free_rec) <
le16_to_cpu(el->l_count)) {
- if (lowest_bh)
- brelse(lowest_bh);
+ brelse(lowest_bh);
lowest_bh = bh;
get_bh(lowest_bh);
}
@@ -895,14 +1200,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
/* If we didn't find one and the fe doesn't have any room,
* then return '1' */
- if (!lowest_bh
- && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+ el = et->et_root_el;
+ if (!lowest_bh && (el->l_next_free_rec == el->l_count))
status = 1;
*target_bh = lowest_bh;
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
mlog_exit(status);
return status;
@@ -919,19 +1223,19 @@ bail:
* *last_eb_bh will be updated by ocfs2_add_branch().
*/
static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
- struct buffer_head *di_bh, int *final_depth,
+ struct ocfs2_extent_tree *et, int *final_depth,
struct buffer_head **last_eb_bh,
struct ocfs2_alloc_context *meta_ac)
{
int ret, shift;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
- int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
+ struct ocfs2_extent_list *el = et->et_root_el;
+ int depth = le16_to_cpu(el->l_tree_depth);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *bh = NULL;
BUG_ON(meta_ac == NULL);
- shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
+ shift = ocfs2_find_branch_target(osb, inode, et, &bh);
if (shift < 0) {
ret = shift;
mlog_errno(ret);
@@ -948,7 +1252,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
/* ocfs2_shift_tree_depth will return us a buffer with
* the new extent block (so we can pass that to
* ocfs2_add_branch). */
- ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
+ ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
meta_ac, &bh);
if (ret < 0) {
mlog_errno(ret);
@@ -975,7 +1279,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
/* call ocfs2_add_branch to add the final part of the tree with
* the new data. */
mlog(0, "add branch. bh = %p\n", bh);
- ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
+ ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
meta_ac);
if (ret < 0) {
mlog_errno(ret);
@@ -990,15 +1294,6 @@ out:
}
/*
- * This is only valid for leaf nodes, which are the only ones that can
- * have empty extents anyway.
- */
-static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
-{
- return !rec->e_leaf_clusters;
-}
-
-/*
* This function will discard the rightmost extent record.
*/
static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
@@ -1245,8 +1540,7 @@ static int __ocfs2_find_path(struct inode *inode,
brelse(bh);
bh = NULL;
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
- &bh, OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, blkno, &bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2067,11 +2361,11 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
struct ocfs2_path *right_path,
int subtree_index,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- int *deleted)
+ int *deleted,
+ struct ocfs2_extent_tree *et)
{
int ret, i, del_right_subtree = 0, right_has_empty = 0;
- struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
struct ocfs2_extent_block *eb;
@@ -2123,7 +2417,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
* We have to update i_last_eb_blk during the meta
* data delete.
*/
- ret = ocfs2_journal_access(handle, inode, di_bh,
+ ret = ocfs2_journal_access(handle, inode, et_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -2198,7 +2492,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
ocfs2_update_edge_lengths(inode, handle, left_path);
eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
- di->i_last_eb_blk = eb->h_blkno;
+ ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
/*
* Removal of the extent in the left leaf was skipped
@@ -2208,7 +2502,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
if (right_has_empty)
ocfs2_remove_empty_extent(left_leaf_el);
- ret = ocfs2_journal_dirty(handle, di_bh);
+ ret = ocfs2_journal_dirty(handle, et_root_bh);
if (ret)
mlog_errno(ret);
@@ -2331,7 +2625,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
handle_t *handle, int orig_credits,
struct ocfs2_path *path,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- struct ocfs2_path **empty_extent_path)
+ struct ocfs2_path **empty_extent_path,
+ struct ocfs2_extent_tree *et)
{
int ret, subtree_root, deleted;
u32 right_cpos;
@@ -2404,7 +2699,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
right_path, subtree_root,
- dealloc, &deleted);
+ dealloc, &deleted, et);
if (ret == -EAGAIN) {
/*
* The rotation has to temporarily stop due to
@@ -2447,29 +2742,20 @@ out:
}
static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
- struct ocfs2_path *path,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_extent_tree *et)
{
int ret, subtree_index;
u32 cpos;
struct ocfs2_path *left_path = NULL;
- struct ocfs2_dinode *di;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
- /*
- * XXX: This code assumes that the root is an inode, which is
- * true for now but may change as tree code gets generic.
- */
- di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- ret = -EIO;
- ocfs2_error(inode->i_sb,
- "Inode %llu has invalid path root",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- goto out;
- }
+ ret = ocfs2_et_sanity_check(inode, et);
+ if (ret)
+ goto out;
/*
* There's two ways we handle this depending on
* whether path is the only existing one.
@@ -2526,7 +2812,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
ocfs2_update_edge_lengths(inode, handle, left_path);
eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
- di->i_last_eb_blk = eb->h_blkno;
+ ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
} else {
/*
* 'path' is also the leftmost path which
@@ -2537,12 +2823,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
*/
ocfs2_unlink_path(inode, handle, dealloc, path, 1);
- el = &di->id2.i_list;
+ el = et->et_root_el;
el->l_tree_depth = 0;
el->l_next_free_rec = 0;
memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
- di->i_last_eb_blk = 0;
+ ocfs2_et_set_last_eb_blk(et, 0);
}
ocfs2_journal_dirty(handle, path_root_bh(path));
@@ -2570,7 +2856,8 @@ out:
*/
static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
struct ocfs2_path *path,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_extent_tree *et)
{
int ret, orig_credits = handle->h_buffer_credits;
struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
@@ -2584,7 +2871,7 @@ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
if (path->p_tree_depth == 0) {
rightmost_no_delete:
/*
- * In-inode extents. This is trivially handled, so do
+ * Inline extents. This is trivially handled, so do
* it up front.
*/
ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
@@ -2638,7 +2925,7 @@ rightmost_no_delete:
*/
ret = ocfs2_remove_rightmost_path(inode, handle, path,
- dealloc);
+ dealloc, et);
if (ret)
mlog_errno(ret);
goto out;
@@ -2650,7 +2937,7 @@ rightmost_no_delete:
*/
try_rotate:
ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
- dealloc, &restart_path);
+ dealloc, &restart_path, et);
if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out;
@@ -2662,7 +2949,7 @@ try_rotate:
ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
tmp_path, dealloc,
- &restart_path);
+ &restart_path, et);
if (ret && ret != -EAGAIN) {
mlog_errno(ret);
goto out;
@@ -2948,6 +3235,7 @@ static int ocfs2_merge_rec_left(struct inode *inode,
handle_t *handle,
struct ocfs2_extent_rec *split_rec,
struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_extent_tree *et,
int index)
{
int ret, i, subtree_index = 0, has_empty_extent = 0;
@@ -3068,7 +3356,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
le16_to_cpu(el->l_next_free_rec) == 1) {
ret = ocfs2_remove_rightmost_path(inode, handle,
- right_path, dealloc);
+ right_path,
+ dealloc, et);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3095,7 +3384,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
int split_index,
struct ocfs2_extent_rec *split_rec,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- struct ocfs2_merge_ctxt *ctxt)
+ struct ocfs2_merge_ctxt *ctxt,
+ struct ocfs2_extent_tree *et)
{
int ret = 0;
@@ -3113,7 +3403,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* illegal.
*/
ret = ocfs2_rotate_tree_left(inode, handle, path,
- dealloc);
+ dealloc, et);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3156,7 +3446,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
/* The merge left us with an empty extent, remove it. */
- ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+ ret = ocfs2_rotate_tree_left(inode, handle, path,
+ dealloc, et);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3170,7 +3461,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
*/
ret = ocfs2_merge_rec_left(inode, path,
handle, rec,
- dealloc,
+ dealloc, et,
split_index);
if (ret) {
@@ -3179,7 +3470,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
}
ret = ocfs2_rotate_tree_left(inode, handle, path,
- dealloc);
+ dealloc, et);
/*
* Error from this last rotate is not critical, so
* print but don't bubble it up.
@@ -3199,7 +3490,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
ret = ocfs2_merge_rec_left(inode,
path,
handle, split_rec,
- dealloc,
+ dealloc, et,
split_index);
if (ret) {
mlog_errno(ret);
@@ -3222,7 +3513,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
* our leaf. Try to rotate it away.
*/
ret = ocfs2_rotate_tree_left(inode, handle, path,
- dealloc);
+ dealloc, et);
if (ret)
mlog_errno(ret);
ret = 0;
@@ -3356,16 +3647,6 @@ rotate:
ocfs2_rotate_leaf(el, insert_rec);
}
-static inline void ocfs2_update_dinode_clusters(struct inode *inode,
- struct ocfs2_dinode *di,
- u32 clusters)
-{
- le32_add_cpu(&di->i_clusters, clusters);
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
- spin_unlock(&OCFS2_I(inode)->ip_lock);
-}
-
static void ocfs2_adjust_rightmost_records(struct inode *inode,
handle_t *handle,
struct ocfs2_path *path,
@@ -3567,8 +3848,8 @@ static void ocfs2_split_record(struct inode *inode,
}
/*
- * This function only does inserts on an allocation b-tree. For dinode
- * lists, ocfs2_insert_at_leaf() is called directly.
+ * This function only does inserts on an allocation b-tree. For tree
+ * depth = 0, ocfs2_insert_at_leaf() is called directly.
*
* right_path is the path we want to do the actual insert
* in. left_path should only be passed in if we need to update that
@@ -3665,7 +3946,7 @@ out:
static int ocfs2_do_insert_extent(struct inode *inode,
handle_t *handle,
- struct buffer_head *di_bh,
+ struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *insert_rec,
struct ocfs2_insert_type *type)
{
@@ -3673,13 +3954,11 @@ static int ocfs2_do_insert_extent(struct inode *inode,
u32 cpos;
struct ocfs2_path *right_path = NULL;
struct ocfs2_path *left_path = NULL;
- struct ocfs2_dinode *di;
struct ocfs2_extent_list *el;
- di = (struct ocfs2_dinode *) di_bh->b_data;
- el = &di->id2.i_list;
+ el = et->et_root_el;
- ret = ocfs2_journal_access(handle, inode, di_bh,
+ ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -3691,7 +3970,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
goto out_update_clusters;
}
- right_path = ocfs2_new_inode_path(di_bh);
+ right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
if (!right_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -3741,7 +4020,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
* ocfs2_rotate_tree_right() might have extended the
* transaction without re-journaling our tree root.
*/
- ret = ocfs2_journal_access(handle, inode, di_bh,
+ ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
@@ -3766,10 +4045,10 @@ static int ocfs2_do_insert_extent(struct inode *inode,
out_update_clusters:
if (type->ins_split == SPLIT_NONE)
- ocfs2_update_dinode_clusters(inode, di,
- le16_to_cpu(insert_rec->e_leaf_clusters));
+ ocfs2_et_update_clusters(inode, et,
+ le16_to_cpu(insert_rec->e_leaf_clusters));
- ret = ocfs2_journal_dirty(handle, di_bh);
+ ret = ocfs2_journal_dirty(handle, et->et_root_bh);
if (ret)
mlog_errno(ret);
@@ -3899,7 +4178,8 @@ out:
static void ocfs2_figure_contig_type(struct inode *inode,
struct ocfs2_insert_type *insert,
struct ocfs2_extent_list *el,
- struct ocfs2_extent_rec *insert_rec)
+ struct ocfs2_extent_rec *insert_rec,
+ struct ocfs2_extent_tree *et)
{
int i;
enum ocfs2_contig_type contig_type = CONTIG_NONE;
@@ -3915,6 +4195,21 @@ static void ocfs2_figure_contig_type(struct inode *inode,
}
}
insert->ins_contig = contig_type;
+
+ if (insert->ins_contig != CONTIG_NONE) {
+ struct ocfs2_extent_rec *rec =
+ &el->l_recs[insert->ins_contig_index];
+ unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
+ le16_to_cpu(insert_rec->e_leaf_clusters);
+
+ /*
+ * Caller might want us to limit the size of extents, don't
+ * calculate contiguousness if we might exceed that limit.
+ */
+ if (et->et_max_leaf_clusters &&
+ (len > et->et_max_leaf_clusters))
+ insert->ins_contig = CONTIG_NONE;
+ }
}
/*
@@ -3923,8 +4218,8 @@ static void ocfs2_figure_contig_type(struct inode *inode,
* ocfs2_figure_appending_type() will figure out whether we'll have to
* insert at the tail of the rightmost leaf.
*
- * This should also work against the dinode list for tree's with 0
- * depth. If we consider the dinode list to be the rightmost leaf node
+ * This should also work against the root extent list for tree's with 0
+ * depth. If we consider the root extent list to be the rightmost leaf node
* then the logic here makes sense.
*/
static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
@@ -3975,14 +4270,13 @@ set_tail_append:
* structure.
*/
static int ocfs2_figure_insert_type(struct inode *inode,
- struct buffer_head *di_bh,
+ struct ocfs2_extent_tree *et,
struct buffer_head **last_eb_bh,
struct ocfs2_extent_rec *insert_rec,
int *free_records,
struct ocfs2_insert_type *insert)
{
int ret;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct ocfs2_path *path = NULL;
@@ -3990,7 +4284,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
insert->ins_split = SPLIT_NONE;
- el = &di->id2.i_list;
+ el = et->et_root_el;
insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
if (el->l_tree_depth) {
@@ -4000,9 +4294,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
* ocfs2_figure_insert_type() and ocfs2_add_branch()
* may want it later.
*/
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- le64_to_cpu(di->i_last_eb_blk), &bh,
- OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
if (ret) {
mlog_exit(ret);
goto out;
@@ -4023,12 +4315,12 @@ static int ocfs2_figure_insert_type(struct inode *inode,
le16_to_cpu(el->l_next_free_rec);
if (!insert->ins_tree_depth) {
- ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+ ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
ocfs2_figure_appending_type(insert, el, insert_rec);
return 0;
}
- path = ocfs2_new_inode_path(di_bh);
+ path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
if (!path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -4057,7 +4349,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
* into two types of appends: simple record append, or a
* rotate inside the tail leaf.
*/
- ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+ ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
/*
* The insert code isn't quite ready to deal with all cases of
@@ -4078,7 +4370,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
* the case that we're doing a tail append, so maybe we can
* take advantage of that information somehow.
*/
- if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
+ if (ocfs2_et_get_last_eb_blk(et) ==
+ path_leaf_bh(path)->b_blocknr) {
/*
* Ok, ocfs2_find_path() returned us the rightmost
* tree path. This might be an appending insert. There are
@@ -4108,7 +4401,7 @@ out:
int ocfs2_insert_extent(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
- struct buffer_head *fe_bh,
+ struct ocfs2_extent_tree *et,
u32 cpos,
u64 start_blk,
u32 new_clusters,
@@ -4121,26 +4414,21 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
struct ocfs2_insert_type insert = {0, };
struct ocfs2_extent_rec rec;
- BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
-
mlog(0, "add %u clusters at position %u to inode %llu\n",
new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
- mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
- (OCFS2_I(inode)->ip_clusters != cpos),
- "Device %s, asking for sparse allocation: inode %llu, "
- "cpos %u, clusters %u\n",
- osb->dev_str,
- (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
- OCFS2_I(inode)->ip_clusters);
-
memset(&rec, 0, sizeof(rec));
rec.e_cpos = cpu_to_le32(cpos);
rec.e_blkno = cpu_to_le64(start_blk);
rec.e_leaf_clusters = cpu_to_le16(new_clusters);
rec.e_flags = flags;
+ status = ocfs2_et_insert_check(inode, et, &rec);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
- status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
+ status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
&free_records, &insert);
if (status < 0) {
mlog_errno(status);
@@ -4154,7 +4442,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
free_records, insert.ins_tree_depth);
if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
- status = ocfs2_grow_tree(inode, handle, fe_bh,
+ status = ocfs2_grow_tree(inode, handle, et,
&insert.ins_tree_depth, &last_eb_bh,
meta_ac);
if (status) {
@@ -4164,17 +4452,124 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
}
/* Finally, we can add clusters. This might rotate the tree for us. */
- status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
+ status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
if (status < 0)
mlog_errno(status);
- else
+ else if (et->et_ops == &ocfs2_dinode_et_ops)
ocfs2_extent_map_insert_rec(inode, &rec);
bail:
- if (last_eb_bh)
- brelse(last_eb_bh);
+ brelse(last_eb_bh);
+
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Allcate and add clusters into the extent b-tree.
+ * The new clusters(clusters_to_add) will be inserted at logical_offset.
+ * The extent b-tree's root is specified by et, and
+ * it is not limited to the file storage. Any extent tree can use this
+ * function if it implements the proper ocfs2_extent_tree.
+ */
+int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
+ struct inode *inode,
+ u32 *logical_offset,
+ u32 clusters_to_add,
+ int mark_unwritten,
+ struct ocfs2_extent_tree *et,
+ handle_t *handle,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason_ret)
+{
+ int status = 0;
+ int free_extents;
+ enum ocfs2_alloc_restarted reason = RESTART_NONE;
+ u32 bit_off, num_bits;
+ u64 block;
+ u8 flags = 0;
+
+ BUG_ON(!clusters_to_add);
+
+ if (mark_unwritten)
+ flags = OCFS2_EXT_UNWRITTEN;
+
+ free_extents = ocfs2_num_free_extents(osb, inode, et);
+ if (free_extents < 0) {
+ status = free_extents;
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* there are two cases which could cause us to EAGAIN in the
+ * we-need-more-metadata case:
+ * 1) we haven't reserved *any*
+ * 2) we are so fragmented, we've needed to add metadata too
+ * many times. */
+ if (!free_extents && !meta_ac) {
+ mlog(0, "we haven't reserved any metadata!\n");
+ status = -EAGAIN;
+ reason = RESTART_META;
+ goto leave;
+ } else if ((!free_extents)
+ && (ocfs2_alloc_context_bits_left(meta_ac)
+ < ocfs2_extend_meta_needed(et->et_root_el))) {
+ mlog(0, "filesystem is really fragmented...\n");
+ status = -EAGAIN;
+ reason = RESTART_META;
+ goto leave;
+ }
+
+ status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+ clusters_to_add, &bit_off, &num_bits);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto leave;
+ }
+ BUG_ON(num_bits > clusters_to_add);
+
+ /* reserve our write early -- insert_extent may update the inode */
+ status = ocfs2_journal_access(handle, inode, et->et_root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+ mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
+ num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ status = ocfs2_insert_extent(osb, handle, inode, et,
+ *logical_offset, block,
+ num_bits, flags, meta_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ status = ocfs2_journal_dirty(handle, et->et_root_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ clusters_to_add -= num_bits;
+ *logical_offset += num_bits;
+
+ if (clusters_to_add) {
+ mlog(0, "need to alloc once more, wanted = %u\n",
+ clusters_to_add);
+ status = -EAGAIN;
+ reason = RESTART_TRANS;
+ }
+
+leave:
mlog_exit(status);
+ if (reason_ret)
+ *reason_ret = reason;
return status;
}
@@ -4201,7 +4596,7 @@ static void ocfs2_make_right_split_rec(struct super_block *sb,
static int ocfs2_split_and_insert(struct inode *inode,
handle_t *handle,
struct ocfs2_path *path,
- struct buffer_head *di_bh,
+ struct ocfs2_extent_tree *et,
struct buffer_head **last_eb_bh,
int split_index,
struct ocfs2_extent_rec *orig_split_rec,
@@ -4215,7 +4610,6 @@ static int ocfs2_split_and_insert(struct inode *inode,
struct ocfs2_extent_rec split_rec = *orig_split_rec;
struct ocfs2_insert_type insert;
struct ocfs2_extent_block *eb;
- struct ocfs2_dinode *di;
leftright:
/*
@@ -4224,8 +4618,7 @@ leftright:
*/
rec = path_leaf_el(path)->l_recs[split_index];
- di = (struct ocfs2_dinode *)di_bh->b_data;
- rightmost_el = &di->id2.i_list;
+ rightmost_el = et->et_root_el;
depth = le16_to_cpu(rightmost_el->l_tree_depth);
if (depth) {
@@ -4236,8 +4629,8 @@ leftright:
if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
le16_to_cpu(rightmost_el->l_count)) {
- ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
- meta_ac);
+ ret = ocfs2_grow_tree(inode, handle, et,
+ &depth, last_eb_bh, meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4274,8 +4667,7 @@ leftright:
do_leftright = 1;
}
- ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
- &insert);
+ ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4317,8 +4709,9 @@ out:
* of the tree is required. All other cases will degrade into a less
* optimal tree layout.
*
- * last_eb_bh should be the rightmost leaf block for any inode with a
- * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
+ * last_eb_bh should be the rightmost leaf block for any extent
+ * btree. Since a split may grow the tree or a merge might shrink it,
+ * the caller cannot trust the contents of that buffer after this call.
*
* This code is optimized for readability - several passes might be
* made over certain portions of the tree. All of those blocks will
@@ -4326,7 +4719,7 @@ out:
* extra overhead is not expressed in terms of disk reads.
*/
static int __ocfs2_mark_extent_written(struct inode *inode,
- struct buffer_head *di_bh,
+ struct ocfs2_extent_tree *et,
handle_t *handle,
struct ocfs2_path *path,
int split_index,
@@ -4366,11 +4759,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
*/
if (path->p_tree_depth) {
struct ocfs2_extent_block *eb;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- le64_to_cpu(di->i_last_eb_blk),
- &last_eb_bh, OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
+ &last_eb_bh);
if (ret) {
mlog_exit(ret);
goto out;
@@ -4403,7 +4794,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
if (ctxt.c_split_covers_rec)
el->l_recs[split_index] = *split_rec;
else
- ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
+ ret = ocfs2_split_and_insert(inode, handle, path, et,
&last_eb_bh, split_index,
split_rec, meta_ac);
if (ret)
@@ -4411,7 +4802,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
} else {
ret = ocfs2_try_to_merge_extent(inode, handle, path,
split_index, split_rec,
- dealloc, &ctxt);
+ dealloc, &ctxt, et);
if (ret)
mlog_errno(ret);
}
@@ -4429,7 +4820,8 @@ out:
*
* The caller is responsible for passing down meta_ac if we'll need it.
*/
-int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_mark_extent_written(struct inode *inode,
+ struct ocfs2_extent_tree *et,
handle_t *handle, u32 cpos, u32 len, u32 phys,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc)
@@ -4455,10 +4847,14 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
/*
* XXX: This should be fixed up so that we just re-insert the
* next extent records.
+ *
+ * XXX: This is a hack on the extent tree, maybe it should be
+ * an op?
*/
- ocfs2_extent_map_trunc(inode, 0);
+ if (et->et_ops == &ocfs2_dinode_et_ops)
+ ocfs2_extent_map_trunc(inode, 0);
- left_path = ocfs2_new_inode_path(di_bh);
+ left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -4489,8 +4885,9 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
- ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
- index, &split_rec, meta_ac, dealloc);
+ ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
+ index, &split_rec, meta_ac,
+ dealloc);
if (ret)
mlog_errno(ret);
@@ -4499,13 +4896,12 @@ out:
return ret;
}
-static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
+static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
handle_t *handle, struct ocfs2_path *path,
int index, u32 new_range,
struct ocfs2_alloc_context *meta_ac)
{
int ret, depth, credits = handle->h_buffer_credits;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct buffer_head *last_eb_bh = NULL;
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *rightmost_el, *el;
@@ -4522,9 +4918,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
depth = path->p_tree_depth;
if (depth > 0) {
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- le64_to_cpu(di->i_last_eb_blk),
- &last_eb_bh, OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
+ &last_eb_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -4535,7 +4930,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
} else
rightmost_el = path_leaf_el(path);
- credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
+ credits += path->p_tree_depth +
+ ocfs2_extend_meta_needed(et->et_root_el);
ret = ocfs2_extend_trans(handle, credits);
if (ret) {
mlog_errno(ret);
@@ -4544,7 +4940,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
le16_to_cpu(rightmost_el->l_count)) {
- ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
+ ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
meta_ac);
if (ret) {
mlog_errno(ret);
@@ -4558,7 +4954,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
insert.ins_split = SPLIT_RIGHT;
insert.ins_tree_depth = depth;
- ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
+ ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
if (ret)
mlog_errno(ret);
@@ -4570,7 +4966,8 @@ out:
static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
struct ocfs2_path *path, int index,
struct ocfs2_cached_dealloc_ctxt *dealloc,
- u32 cpos, u32 len)
+ u32 cpos, u32 len,
+ struct ocfs2_extent_tree *et)
{
int ret;
u32 left_cpos, rec_range, trunc_range;
@@ -4582,7 +4979,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
struct ocfs2_extent_block *eb;
if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
- ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+ ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4713,7 +5110,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
ocfs2_journal_dirty(handle, path_leaf_bh(path));
- ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+ ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4724,7 +5121,8 @@ out:
return ret;
}
-int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_remove_extent(struct inode *inode,
+ struct ocfs2_extent_tree *et,
u32 cpos, u32 len, handle_t *handle,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc)
@@ -4733,11 +5131,11 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
u32 rec_range, trunc_range;
struct ocfs2_extent_rec *rec;
struct ocfs2_extent_list *el;
- struct ocfs2_path *path;
+ struct ocfs2_path *path = NULL;
ocfs2_extent_map_trunc(inode, 0);
- path = ocfs2_new_inode_path(di_bh);
+ path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
if (!path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -4790,13 +5188,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
- cpos, len);
+ cpos, len, et);
if (ret) {
mlog_errno(ret);
goto out;
}
} else {
- ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
+ ret = ocfs2_split_tree(inode, et, handle, path, index,
trunc_range, meta_ac);
if (ret) {
mlog_errno(ret);
@@ -4845,7 +5243,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
}
ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
- cpos, len);
+ cpos, len, et);
if (ret) {
mlog_errno(ret);
goto out;
@@ -5188,8 +5586,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
- OCFS2_BH_CACHED, inode);
+ status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
if (status < 0) {
iput(inode);
mlog_errno(status);
@@ -5264,8 +5661,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
bail:
if (tl_inode)
iput(tl_inode);
- if (tl_bh)
- brelse(tl_bh);
+ brelse(tl_bh);
if (status < 0 && (*tl_copy)) {
kfree(*tl_copy);
@@ -6008,20 +6404,13 @@ bail:
return status;
}
-static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
{
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
return 0;
}
-static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
-{
- set_buffer_uptodate(bh);
- mark_buffer_dirty(bh);
- return ocfs2_journal_dirty_data(handle, bh);
-}
-
static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
unsigned int from, unsigned int to,
struct page *page, int zero, u64 *phys)
@@ -6040,17 +6429,18 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
* here if they aren't - ocfs2_map_page_blocks()
* might've skipped some
*/
- if (ocfs2_should_order_data(inode)) {
- ret = walk_page_buffers(handle,
- page_buffers(page),
- from, to, &partial,
- ocfs2_ordered_zero_func);
- if (ret < 0)
- mlog_errno(ret);
- } else {
+ ret = walk_page_buffers(handle, page_buffers(page),
+ from, to, &partial,
+ ocfs2_zero_func);
+ if (ret < 0)
+ mlog_errno(ret);
+ else if (ocfs2_should_order_data(inode)) {
+ ret = ocfs2_jbd2_file_inode(handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
ret = walk_page_buffers(handle, page_buffers(page),
from, to, &partial,
- ocfs2_writeback_zero_func);
+ ocfs2_journal_dirty_data);
+#endif
if (ret < 0)
mlog_errno(ret);
}
@@ -6215,20 +6605,29 @@ out:
return ret;
}
-static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
+static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
+ struct ocfs2_dinode *di)
{
unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
+ unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
- memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
+ if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+ memset(&di->id2, 0, blocksize -
+ offsetof(struct ocfs2_dinode, id2) -
+ xattrsize);
+ else
+ memset(&di->id2, 0, blocksize -
+ offsetof(struct ocfs2_dinode, id2));
}
void ocfs2_dinode_new_extent_list(struct inode *inode,
struct ocfs2_dinode *di)
{
- ocfs2_zero_dinode_id2(inode, di);
+ ocfs2_zero_dinode_id2_with_xattr(inode, di);
di->id2.i_list.l_tree_depth = 0;
di->id2.i_list.l_next_free_rec = 0;
- di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
+ di->id2.i_list.l_count = cpu_to_le16(
+ ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
}
void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
@@ -6245,9 +6644,10 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
* We clear the entire i_data structure here so that all
* fields can be properly initialized.
*/
- ocfs2_zero_dinode_id2(inode, di);
+ ocfs2_zero_dinode_id2_with_xattr(inode, di);
- idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
+ idata->id_count = cpu_to_le16(
+ ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
}
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
@@ -6262,6 +6662,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct ocfs2_alloc_context *data_ac = NULL;
struct page **pages = NULL;
loff_t end = osb->s_clustersize;
+ struct ocfs2_extent_tree et;
has_data = i_size_read(inode) ? 1 : 0;
@@ -6361,7 +6762,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
* this proves to be false, we could always re-build
* the in-inode data from our pages.
*/
- ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
+ ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
+ ret = ocfs2_insert_extent(osb, handle, inode, &et,
0, block, 1, 0, NULL);
if (ret) {
mlog_errno(ret);
@@ -6404,13 +6806,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
handle_t *handle = NULL;
struct inode *tl_inode = osb->osb_tl_inode;
struct ocfs2_path *path = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
mlog_entry_void();
new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
i_size_read(inode));
- path = ocfs2_new_inode_path(fe_bh);
+ path = ocfs2_new_path(fe_bh, &di->id2.i_list);
if (!path) {
status = -ENOMEM;
mlog_errno(status);
@@ -6581,8 +6984,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
if (fe->id2.i_list.l_tree_depth) {
- status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
- &last_eb_bh, OCFS2_BH_CACHED, inode);
+ status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
+ &last_eb_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -6695,8 +7098,7 @@ static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
mlog(ML_NOTICE,
"Truncate completion has non-empty dealloc context\n");
- if (tc->tc_last_eb_bh)
- brelse(tc->tc_last_eb_bh);
+ brelse(tc->tc_last_eb_bh);
kfree(tc);
}
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 42ff94bd801..70257c84cfb 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -26,30 +26,102 @@
#ifndef OCFS2_ALLOC_H
#define OCFS2_ALLOC_H
+
+/*
+ * For xattr tree leaf, we limit the leaf byte size to be 64K.
+ */
+#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536
+
+/*
+ * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
+ * the b-tree operations in ocfs2. Now all the b-tree operations are not
+ * limited to ocfs2_dinode only. Any data which need to allocate clusters
+ * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree
+ * and operation.
+ *
+ * ocfs2_extent_tree becomes the first-class object for extent tree
+ * manipulation. Callers of the alloc.c code need to fill it via one of
+ * the ocfs2_init_*_extent_tree() operations below.
+ *
+ * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
+ * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
+ * functions.
+ * ocfs2_extent_tree_operations abstract the normal operations we do for
+ * the root of extent b-tree.
+ */
+struct ocfs2_extent_tree_operations;
+struct ocfs2_extent_tree {
+ struct ocfs2_extent_tree_operations *et_ops;
+ struct buffer_head *et_root_bh;
+ struct ocfs2_extent_list *et_root_el;
+ void *et_object;
+ unsigned int et_max_leaf_clusters;
+};
+
+/*
+ * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the
+ * specified object buffer.
+ */
+void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
+ struct inode *inode,
+ struct buffer_head *bh);
+void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
+ struct inode *inode,
+ struct buffer_head *bh);
+void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
+ struct inode *inode,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_value_root *xv);
+
struct ocfs2_alloc_context;
int ocfs2_insert_extent(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
- struct buffer_head *fe_bh,
+ struct ocfs2_extent_tree *et,
u32 cpos,
u64 start_blk,
u32 new_clusters,
u8 flags,
struct ocfs2_alloc_context *meta_ac);
+
+enum ocfs2_alloc_restarted {
+ RESTART_NONE = 0,
+ RESTART_TRANS,
+ RESTART_META
+};
+int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
+ struct inode *inode,
+ u32 *logical_offset,
+ u32 clusters_to_add,
+ int mark_unwritten,
+ struct ocfs2_extent_tree *et,
+ handle_t *handle,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason_ret);
struct ocfs2_cached_dealloc_ctxt;
-int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_mark_extent_written(struct inode *inode,
+ struct ocfs2_extent_tree *et,
handle_t *handle, u32 cpos, u32 len, u32 phys,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
-int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+int ocfs2_remove_extent(struct inode *inode,
+ struct ocfs2_extent_tree *et,
u32 cpos, u32 len, handle_t *handle,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct inode *inode,
- struct ocfs2_dinode *fe);
-/* how many new metadata chunks would an allocation need at maximum? */
-static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
+ struct ocfs2_extent_tree *et);
+
+/*
+ * how many new metadata chunks would an allocation need at maximum?
+ *
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
+static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el)
{
/*
* Rather than do all the work of determining how much we need
@@ -59,7 +131,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe)
* new tree_depth==0 extent_block, and one block at the new
* top-of-the tree.
*/
- return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2;
+ return le16_to_cpu(root_el->l_tree_depth) + 2;
}
void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di);
@@ -146,4 +218,13 @@ static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
return le16_to_cpu(rec->e_leaf_clusters);
}
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ */
+static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
+{
+ return !rec->e_leaf_clusters;
+}
+
#endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 17964c0505a..c22543b3342 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -68,9 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
goto bail;
}
- status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- OCFS2_I(inode)->ip_blkno,
- &bh, OCFS2_BH_CACHED, inode);
+ status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -128,8 +126,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
err = 0;
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
mlog_exit(err);
return err;
@@ -174,10 +171,17 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
* need to use BH_New is when we're extending i_size on a file
* system which doesn't support holes, in which case BH_New
* allows block_prepare_write() to zero.
+ *
+ * If we see this on a sparse file system, then a truncate has
+ * raced us and removed the cluster. In this case, we clear
+ * the buffers dirty and uptodate bits and let the buffer code
+ * ignore it as a hole.
*/
- mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
- "ino %lu, iblock %llu\n", inode->i_ino,
- (unsigned long long)iblock);
+ if (create && p_blkno == 0 && ocfs2_sparse_alloc(osb)) {
+ clear_buffer_dirty(bh_result);
+ clear_buffer_uptodate(bh_result);
+ goto bail;
+ }
/* Treat the unwritten extent as a hole for zeroing purposes. */
if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
@@ -254,13 +258,11 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
{
int ret;
struct buffer_head *di_bh = NULL;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
BUG_ON(!PageLocked(page));
BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
- ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh,
- OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -478,11 +480,14 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
}
if (ocfs2_should_order_data(inode)) {
+ ret = ocfs2_jbd2_file_inode(handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
ret = walk_page_buffers(handle,
page_buffers(page),
from, to, NULL,
ocfs2_journal_dirty_data);
- if (ret < 0)
+#endif
+ if (ret < 0)
mlog_errno(ret);
}
out:
@@ -587,7 +592,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
goto bail;
}
- if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
+ if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
ocfs2_error(inode->i_sb,
"Inode %llu has a hole at block %llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -662,7 +667,7 @@ static void ocfs2_invalidatepage(struct page *page, unsigned long offset)
{
journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
- journal_invalidatepage(journal, page, offset);
+ jbd2_journal_invalidatepage(journal, page, offset);
}
static int ocfs2_releasepage(struct page *page, gfp_t wait)
@@ -671,7 +676,7 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
if (!page_has_buffers(page))
return 0;
- return journal_try_to_free_buffers(journal, page, wait);
+ return jbd2_journal_try_to_free_buffers(journal, page, wait);
}
static ssize_t ocfs2_direct_IO(int rw,
@@ -1066,12 +1071,19 @@ static void ocfs2_write_failure(struct inode *inode,
for(i = 0; i < wc->w_num_pages; i++) {
tmppage = wc->w_pages[i];
- if (ocfs2_should_order_data(inode))
- walk_page_buffers(wc->w_handle, page_buffers(tmppage),
- from, to, NULL,
- ocfs2_journal_dirty_data);
+ if (page_has_buffers(tmppage)) {
+ if (ocfs2_should_order_data(inode)) {
+ ocfs2_jbd2_file_inode(wc->w_handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
+ walk_page_buffers(wc->w_handle,
+ page_buffers(tmppage),
+ from, to, NULL,
+ ocfs2_journal_dirty_data);
+#endif
+ }
- block_commit_write(tmppage, from, to);
+ block_commit_write(tmppage, from, to);
+ }
}
}
@@ -1232,6 +1244,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
int ret, i, new, should_zero = 0;
u64 v_blkno, p_blkno;
struct inode *inode = mapping->host;
+ struct ocfs2_extent_tree et;
new = phys == 0 ? 1 : 0;
if (new || unwritten)
@@ -1245,10 +1258,10 @@ static int ocfs2_write_cluster(struct address_space *mapping,
* any additional semaphores or cluster locks.
*/
tmp_pos = cpos;
- ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
- &tmp_pos, 1, 0, wc->w_di_bh,
- wc->w_handle, data_ac,
- meta_ac, NULL);
+ ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode,
+ &tmp_pos, 1, 0, wc->w_di_bh,
+ wc->w_handle, data_ac,
+ meta_ac, NULL);
/*
* This shouldn't happen because we must have already
* calculated the correct meta data allocation required. The
@@ -1266,7 +1279,8 @@ static int ocfs2_write_cluster(struct address_space *mapping,
goto out;
}
} else if (unwritten) {
- ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
+ ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
+ ret = ocfs2_mark_extent_written(inode, &et,
wc->w_handle, cpos, 1, phys,
meta_ac, &wc->w_dealloc);
if (ret < 0) {
@@ -1655,6 +1669,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
handle_t *handle;
+ struct ocfs2_extent_tree et;
ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
if (ret) {
@@ -1702,14 +1717,23 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
* ocfs2_lock_allocators(). It greatly over-estimates
* the work to be done.
*/
- ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
- extents_to_split, &data_ac, &meta_ac);
+ mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
+ " clusters_to_add = %u, extents_to_split = %u\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
+ clusters_to_alloc, extents_to_split);
+
+ ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh);
+ ret = ocfs2_lock_allocators(inode, &et,
+ clusters_to_alloc, extents_to_split,
+ &data_ac, &meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
}
- credits = ocfs2_calc_extend_credits(inode->i_sb, di,
+ credits = ocfs2_calc_extend_credits(inode->i_sb,
+ &di->id2.i_list,
clusters_to_alloc);
}
@@ -1894,12 +1918,18 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
to = PAGE_CACHE_SIZE;
}
- if (ocfs2_should_order_data(inode))
- walk_page_buffers(wc->w_handle, page_buffers(tmppage),
- from, to, NULL,
- ocfs2_journal_dirty_data);
-
- block_commit_write(tmppage, from, to);
+ if (page_has_buffers(tmppage)) {
+ if (ocfs2_should_order_data(inode)) {
+ ocfs2_jbd2_file_inode(wc->w_handle, inode);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
+ walk_page_buffers(wc->w_handle,
+ page_buffers(tmppage),
+ from, to, NULL,
+ ocfs2_journal_dirty_data);
+#endif
+ }
+ block_commit_write(tmppage, from, to);
+ }
}
out_write_size:
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index f136639f5b4..7e947c67246 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -66,7 +66,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
/* remove from dirty list before I/O. */
clear_buffer_dirty(bh);
- get_bh(bh); /* for end_buffer_write_sync() */
+ get_bh(bh); /* for end_buffer_write_sync() */
bh->b_end_io = end_buffer_write_sync;
submit_bh(WRITE, bh);
@@ -88,22 +88,103 @@ out:
return ret;
}
-int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
- struct buffer_head *bhs[], int flags,
- struct inode *inode)
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+ unsigned int nr, struct buffer_head *bhs[])
+{
+ int status = 0;
+ unsigned int i;
+ struct buffer_head *bh;
+
+ if (!nr) {
+ mlog(ML_BH_IO, "No buffers will be read!\n");
+ goto bail;
+ }
+
+ for (i = 0 ; i < nr ; i++) {
+ if (bhs[i] == NULL) {
+ bhs[i] = sb_getblk(osb->sb, block++);
+ if (bhs[i] == NULL) {
+ status = -EIO;
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+ bh = bhs[i];
+
+ if (buffer_jbd(bh)) {
+ mlog(ML_ERROR,
+ "trying to sync read a jbd "
+ "managed bh (blocknr = %llu), skipping\n",
+ (unsigned long long)bh->b_blocknr);
+ continue;
+ }
+
+ if (buffer_dirty(bh)) {
+ /* This should probably be a BUG, or
+ * at least return an error. */
+ mlog(ML_ERROR,
+ "trying to sync read a dirty "
+ "buffer! (blocknr = %llu), skipping\n",
+ (unsigned long long)bh->b_blocknr);
+ continue;
+ }
+
+ lock_buffer(bh);
+ if (buffer_jbd(bh)) {
+ mlog(ML_ERROR,
+ "block %llu had the JBD bit set "
+ "while I was in lock_buffer!",
+ (unsigned long long)bh->b_blocknr);
+ BUG();
+ }
+
+ clear_buffer_uptodate(bh);
+ get_bh(bh); /* for end_buffer_read_sync() */
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(READ, bh);
+ }
+
+ for (i = nr; i > 0; i--) {
+ bh = bhs[i - 1];
+
+ if (buffer_jbd(bh)) {
+ mlog(ML_ERROR,
+ "the journal got the buffer while it was "
+ "locked for io! (blocknr = %llu)\n",
+ (unsigned long long)bh->b_blocknr);
+ BUG();
+ }
+
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ /* Status won't be cleared from here on out,
+ * so we can safely record this and loop back
+ * to cleanup the other buffers. */
+ status = -EIO;
+ put_bh(bh);
+ bhs[i - 1] = NULL;
+ }
+ }
+
+bail:
+ return status;
+}
+
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+ struct buffer_head *bhs[], int flags)
{
int status = 0;
- struct super_block *sb;
int i, ignore_cache = 0;
struct buffer_head *bh;
- mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",
- (unsigned long long)block, nr, flags, inode);
+ mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n",
+ inode, (unsigned long long)block, nr, flags);
+ BUG_ON(!inode);
BUG_ON((flags & OCFS2_BH_READAHEAD) &&
- (!inode || !(flags & OCFS2_BH_CACHED)));
+ (flags & OCFS2_BH_IGNORE_CACHE));
- if (osb == NULL || osb->sb == NULL || bhs == NULL) {
+ if (bhs == NULL) {
status = -EINVAL;
mlog_errno(status);
goto bail;
@@ -122,26 +203,19 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
goto bail;
}
- sb = osb->sb;
-
- if (flags & OCFS2_BH_CACHED && !inode)
- flags &= ~OCFS2_BH_CACHED;
-
- if (inode)
- mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
+ mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
for (i = 0 ; i < nr ; i++) {
if (bhs[i] == NULL) {
- bhs[i] = sb_getblk(sb, block++);
+ bhs[i] = sb_getblk(inode->i_sb, block++);
if (bhs[i] == NULL) {
- if (inode)
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+ mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
status = -EIO;
mlog_errno(status);
goto bail;
}
}
bh = bhs[i];
- ignore_cache = 0;
+ ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE);
/* There are three read-ahead cases here which we need to
* be concerned with. All three assume a buffer has
@@ -167,26 +241,27 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
* before our is-it-in-flight check.
*/
- if (flags & OCFS2_BH_CACHED &&
- !ocfs2_buffer_uptodate(inode, bh)) {
+ if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) {
mlog(ML_UPTODATE,
"bh (%llu), inode %llu not uptodate\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)OCFS2_I(inode)->ip_blkno);
+ /* We're using ignore_cache here to say
+ * "go to disk" */
ignore_cache = 1;
}
/* XXX: Can we ever get this and *not* have the cached
* flag set? */
if (buffer_jbd(bh)) {
- if (!(flags & OCFS2_BH_CACHED) || ignore_cache)
+ if (ignore_cache)
mlog(ML_BH_IO, "trying to sync read a jbd "
"managed bh (blocknr = %llu)\n",
(unsigned long long)bh->b_blocknr);
continue;
}
- if (!(flags & OCFS2_BH_CACHED) || ignore_cache) {
+ if (ignore_cache) {
if (buffer_dirty(bh)) {
/* This should probably be a BUG, or
* at least return an error. */
@@ -221,7 +296,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
* previously read-ahead buffer may have
* completed I/O while we were waiting for the
* buffer lock. */
- if ((flags & OCFS2_BH_CACHED)
+ if (!(flags & OCFS2_BH_IGNORE_CACHE)
&& !(flags & OCFS2_BH_READAHEAD)
&& ocfs2_buffer_uptodate(inode, bh)) {
unlock_buffer(bh);
@@ -265,15 +340,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
/* Always set the buffer in the cache, even if it was
* a forced read, or read-ahead which hasn't yet
* completed. */
- if (inode)
- ocfs2_set_buffer_uptodate(inode, bh);
+ ocfs2_set_buffer_uptodate(inode, bh);
}
- if (inode)
- mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
+ mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
(unsigned long long)block, nr,
- (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags);
+ ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
+ flags);
bail:
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index c2e78614c3e..75e1dcb1ade 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,31 +31,29 @@
void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
int uptodate);
-static inline int ocfs2_read_block(struct ocfs2_super *osb,
+static inline int ocfs2_read_block(struct inode *inode,
u64 off,
- struct buffer_head **bh,
- int flags,
- struct inode *inode);
+ struct buffer_head **bh);
int ocfs2_write_block(struct ocfs2_super *osb,
struct buffer_head *bh,
struct inode *inode);
-int ocfs2_read_blocks(struct ocfs2_super *osb,
+int ocfs2_read_blocks(struct inode *inode,
u64 block,
int nr,
struct buffer_head *bhs[],
- int flags,
- struct inode *inode);
+ int flags);
+int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
+ unsigned int nr, struct buffer_head *bhs[]);
int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
struct buffer_head *bh);
-#define OCFS2_BH_CACHED 1
+#define OCFS2_BH_IGNORE_CACHE 1
#define OCFS2_BH_READAHEAD 8
-static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
- struct buffer_head **bh, int flags,
- struct inode *inode)
+static inline int ocfs2_read_block(struct inode *inode, u64 off,
+ struct buffer_head **bh)
{
int status = 0;
@@ -65,8 +63,7 @@ static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
goto bail;
}
- status = ocfs2_read_blocks(osb, off, 1, bh,
- flags, inode);
+ status = ocfs2_read_blocks(inode, off, 1, bh, 0);
bail:
return status;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f02ccb34604..7dce1612553 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1493,24 +1493,18 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
const char *name)
{
struct o2hb_region *reg = NULL;
- struct config_item *ret = NULL;
reg = kzalloc(sizeof(struct o2hb_region), GFP_KERNEL);
if (reg == NULL)
- goto out; /* ENOMEM */
+ return ERR_PTR(-ENOMEM);
config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
- ret = &reg->hr_item;
-
spin_lock(&o2hb_live_lock);
list_add_tail(&reg->hr_all_item, &o2hb_all_regions);
spin_unlock(&o2hb_live_lock);
-out:
- if (ret == NULL)
- kfree(reg);
- return ret;
+ return &reg->hr_item;
}
static void o2hb_heartbeat_group_drop_item(struct config_group *group,
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 23c732f2752..d8a0cb92cef 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -109,6 +109,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(CONN),
define_mask(QUORUM),
define_mask(EXPORT),
+ define_mask(XATTR),
define_mask(ERROR),
define_mask(NOTICE),
define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 597e064bb94..57670c68047 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -112,6 +112,7 @@
#define ML_CONN 0x0000000004000000ULL /* net connection management */
#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
+#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
/* bits that are infrequently given and frequently matched in the high word */
#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 7bf3c0ea7bd..52276c02f71 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -138,18 +138,20 @@ static int nst_seq_show(struct seq_file *seq, void *v)
" message id: %d\n"
" message type: %u\n"
" message key: 0x%08x\n"
- " sock acquiry: %lu.%lu\n"
- " send start: %lu.%lu\n"
- " wait start: %lu.%lu\n",
+ " sock acquiry: %lu.%ld\n"
+ " send start: %lu.%ld\n"
+ " wait start: %lu.%ld\n",
nst, (unsigned long)nst->st_task->pid,
(unsigned long)nst->st_task->tgid,
nst->st_task->comm, nst->st_node,
nst->st_sc, nst->st_id, nst->st_msg_type,
nst->st_msg_key,
- nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec,
- nst->st_send_time.tv_sec, nst->st_send_time.tv_usec,
+ nst->st_sock_time.tv_sec,
+ (long)nst->st_sock_time.tv_usec,
+ nst->st_send_time.tv_sec,
+ (long)nst->st_send_time.tv_usec,
nst->st_status_time.tv_sec,
- nst->st_status_time.tv_usec);
+ (long)nst->st_status_time.tv_usec);
}
spin_unlock(&o2net_debug_lock);
@@ -274,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return sc; /* unused, just needs to be null when done */
}
-#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec
+#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec
static int sc_seq_show(struct seq_file *seq, void *v)
{
@@ -307,12 +309,12 @@ static int sc_seq_show(struct seq_file *seq, void *v)
" remote node: %s\n"
" page off: %zu\n"
" handshake ok: %u\n"
- " timer: %lu.%lu\n"
- " data ready: %lu.%lu\n"
- " advance start: %lu.%lu\n"
- " advance stop: %lu.%lu\n"
- " func start: %lu.%lu\n"
- " func stop: %lu.%lu\n"
+ " timer: %lu.%ld\n"
+ " data ready: %lu.%ld\n"
+ " advance start: %lu.%ld\n"
+ " advance stop: %lu.%ld\n"
+ " func start: %lu.%ld\n"
+ " func stop: %lu.%ld\n"
" func key: %u\n"
" func type: %u\n",
sc,
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index cfdb08b484e..816a3f61330 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -648,26 +648,19 @@ static struct config_item *o2nm_node_group_make_item(struct config_group *group,
const char *name)
{
struct o2nm_node *node = NULL;
- struct config_item *ret = NULL;
if (strlen(name) > O2NM_MAX_NAME_LEN)
- goto out; /* ENAMETOOLONG */
+ return ERR_PTR(-ENAMETOOLONG);
node = kzalloc(sizeof(struct o2nm_node), GFP_KERNEL);
if (node == NULL)
- goto out; /* ENOMEM */
+ return ERR_PTR(-ENOMEM);
strcpy(node->nd_name, name); /* use item.ci_namebuf instead? */
config_item_init_type_name(&node->nd_item, name, &o2nm_node_type);
spin_lock_init(&node->nd_lock);
- ret = &node->nd_item;
-
-out:
- if (ret == NULL)
- kfree(node);
-
- return ret;
+ return &node->nd_item;
}
static void o2nm_node_group_drop_item(struct config_group *group,
@@ -762,7 +755,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
/* this runs under the parent dir's i_mutex; there can be only
* one caller in here at a time */
if (o2nm_single_cluster)
- goto out; /* ENOSPC */
+ return ERR_PTR(-ENOSPC);
cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL);
ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL);
@@ -795,6 +788,7 @@ out:
kfree(ns);
o2hb_free_hb_set(o2hb_group);
kfree(defs);
+ ret = ERR_PTR(-ENOMEM);
}
return ret;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index a27d61581bd..2bcf706d9dd 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -143,8 +143,8 @@ static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
#ifdef CONFIG_DEBUG_FS
-void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
- u32 msgkey, struct task_struct *task, u8 node)
+static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+ u32 msgkey, struct task_struct *task, u8 node)
{
INIT_LIST_HEAD(&nst->st_net_debug_item);
nst->st_task = task;
@@ -153,31 +153,61 @@ void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
nst->st_node = node;
}
-void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
{
do_gettimeofday(&nst->st_sock_time);
}
-void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
{
do_gettimeofday(&nst->st_send_time);
}
-void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
{
do_gettimeofday(&nst->st_status_time);
}
-void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
struct o2net_sock_container *sc)
{
nst->st_sc = sc;
}
-void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
+static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
{
nst->st_id = msg_id;
}
+
+#else /* CONFIG_DEBUG_FS */
+
+static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+ u32 msgkey, struct task_struct *task, u8 node)
+{
+}
+
+static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+{
+}
+
+static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+{
+}
+
+static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+{
+}
+
+static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+ struct o2net_sock_container *sc)
+{
+}
+
+static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
+ u32 msg_id)
+{
+}
+
#endif /* CONFIG_DEBUG_FS */
static inline int o2net_reconnect_delay(void)
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 18307ff81b7..8d58cfe410b 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -224,42 +224,10 @@ struct o2net_send_tracking {
struct timeval st_send_time;
struct timeval st_status_time;
};
-
-void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
- u32 msgkey, struct task_struct *task, u8 node);
-void o2net_set_nst_sock_time(struct o2net_send_tracking *nst);
-void o2net_set_nst_send_time(struct o2net_send_tracking *nst);
-void o2net_set_nst_status_time(struct o2net_send_tracking *nst);
-void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
- struct o2net_sock_container *sc);
-void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id);
-
#else
struct o2net_send_tracking {
u32 dummy;
};
-
-static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
- u32 msgkey, struct task_struct *task, u8 node)
-{
-}
-static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
-{
-}
-static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
-{
-}
-static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
-{
-}
-static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
- struct o2net_sock_container *sc)
-{
-}
-static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
- u32 msg_id)
-{
-}
#endif /* CONFIG_DEBUG_FS */
#endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 8a187584808..026e6eb8518 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -82,6 +82,49 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **new_bh);
+static struct buffer_head *ocfs2_bread(struct inode *inode,
+ int block, int *err, int reada)
+{
+ struct buffer_head *bh = NULL;
+ int tmperr;
+ u64 p_blkno;
+ int readflags = 0;
+
+ if (reada)
+ readflags |= OCFS2_BH_READAHEAD;
+
+ if (((u64)block << inode->i_sb->s_blocksize_bits) >=
+ i_size_read(inode)) {
+ BUG_ON(!reada);
+ return NULL;
+ }
+
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+ NULL);
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ if (tmperr < 0) {
+ mlog_errno(tmperr);
+ goto fail;
+ }
+
+ tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
+ if (tmperr < 0)
+ goto fail;
+
+ tmperr = 0;
+
+ *err = 0;
+ return bh;
+
+fail:
+ brelse(bh);
+ bh = NULL;
+
+ *err = -EIO;
+ return NULL;
+}
+
/*
* bh passed here can be an inode block or a dir data block, depending
* on the inode inline data flag.
@@ -188,8 +231,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
struct ocfs2_dinode *di;
struct ocfs2_inline_data *data;
- ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
- &di_bh, OCFS2_BH_CACHED, dir);
+ ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -260,14 +302,13 @@ restart:
}
if ((bh = bh_use[ra_ptr++]) == NULL)
goto next;
- wait_on_buffer(bh);
- if (!buffer_uptodate(bh)) {
- /* read error, skip block & hope for the best */
+ if (ocfs2_read_block(dir, block, &bh)) {
+ /* read error, skip block & hope for the best.
+ * ocfs2_read_block() has released the bh. */
ocfs2_error(dir->i_sb, "reading directory %llu, "
"offset %lu\n",
(unsigned long long)OCFS2_I(dir)->ip_blkno,
block);
- brelse(bh);
goto next;
}
i = ocfs2_search_dirblock(bh, dir, name, namelen,
@@ -417,8 +458,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
struct ocfs2_dinode *di;
struct ocfs2_inline_data *data;
- ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno,
- &di_bh, OCFS2_BH_CACHED, dir);
+ ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -596,8 +636,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
struct ocfs2_inline_data *data;
struct ocfs2_dir_entry *de;
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
- &di_bh, OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
if (ret) {
mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -716,8 +755,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
i > 0; i--) {
tmp = ocfs2_bread(inode, ++blk, &err, 1);
- if (tmp)
- brelse(tmp);
+ brelse(tmp);
}
last_ra_blk = blk;
ra_sectors = 8;
@@ -899,10 +937,8 @@ int ocfs2_find_files_on_disk(const char *name,
leave:
if (status < 0) {
*dirent = NULL;
- if (*dirent_bh) {
- brelse(*dirent_bh);
- *dirent_bh = NULL;
- }
+ brelse(*dirent_bh);
+ *dirent_bh = NULL;
}
mlog_exit(status);
@@ -951,8 +987,7 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
ret = 0;
bail:
- if (dirent_bh)
- brelse(dirent_bh);
+ brelse(dirent_bh);
mlog_exit(ret);
return ret;
@@ -1127,8 +1162,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
status = 0;
bail:
- if (new_bh)
- brelse(new_bh);
+ brelse(new_bh);
mlog_exit(status);
return status;
@@ -1192,6 +1226,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
struct buffer_head *dirdata_bh = NULL;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
handle_t *handle;
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
alloc = ocfs2_clusters_for_bytes(sb, bytes);
@@ -1300,19 +1337,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
di->i_size = cpu_to_le64(sb->s_blocksize);
di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
- dir->i_blocks = ocfs2_inode_sector_count(dir);
/*
* This should never fail as our extent list is empty and all
* related blocks have been journaled already.
*/
- ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0,
- NULL);
+ ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len,
+ 0, NULL);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out_commit;
}
+ /*
+ * Set i_blocks after the extent insert for the most up to
+ * date ip_clusters value.
+ */
+ dir->i_blocks = ocfs2_inode_sector_count(dir);
+
ret = ocfs2_journal_dirty(handle, di_bh);
if (ret) {
mlog_errno(ret);
@@ -1332,11 +1374,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
}
blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
- ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno,
- len, 0, NULL);
+ ret = ocfs2_insert_extent(osb, handle, dir, &et, 1,
+ blkno, len, 0, NULL);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out_commit;
}
}
@@ -1378,9 +1420,9 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
if (extend) {
u32 offset = OCFS2_I(dir)->ip_clusters;
- status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
- 1, 0, parent_fe_bh, handle,
- data_ac, meta_ac, NULL);
+ status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
+ 1, 0, parent_fe_bh, handle,
+ data_ac, meta_ac, NULL);
BUG_ON(status == -EAGAIN);
if (status < 0) {
mlog_errno(status);
@@ -1425,12 +1467,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
int credits, num_free_extents, drop_alloc_sem = 0;
loff_t dir_i_size;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
+ struct ocfs2_extent_list *el = &fe->id2.i_list;
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
handle_t *handle = NULL;
struct buffer_head *new_bh = NULL;
struct ocfs2_dir_entry * de;
struct super_block *sb = osb->sb;
+ struct ocfs2_extent_tree et;
mlog_entry_void();
@@ -1474,7 +1518,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
spin_lock(&OCFS2_I(dir)->ip_lock);
if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) {
spin_unlock(&OCFS2_I(dir)->ip_lock);
- num_free_extents = ocfs2_num_free_extents(osb, dir, fe);
+ ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh);
+ num_free_extents = ocfs2_num_free_extents(osb, dir, &et);
if (num_free_extents < 0) {
status = num_free_extents;
mlog_errno(status);
@@ -1482,7 +1527,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
}
if (!num_free_extents) {
- status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
+ status = ocfs2_reserve_new_metadata(osb, el, &meta_ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -1497,7 +1542,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
goto bail;
}
- credits = ocfs2_calc_extend_credits(sb, fe, 1);
+ credits = ocfs2_calc_extend_credits(sb, el, 1);
} else {
spin_unlock(&OCFS2_I(dir)->ip_lock);
credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
@@ -1563,8 +1608,7 @@ bail:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
- if (new_bh)
- brelse(new_bh);
+ brelse(new_bh);
mlog_exit(status);
return status;
@@ -1691,8 +1735,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
status = 0;
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
mlog_exit(status);
return status;
@@ -1751,7 +1794,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
*ret_de_bh = bh;
bh = NULL;
out:
- if (bh)
- brelse(bh);
+ brelse(bh);
return ret;
}
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index e48aba698b7..533a789c3ef 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -267,8 +267,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
return writelen;
}
-static void dlmfs_init_once(struct kmem_cache *cachep,
- void *foo)
+static void dlmfs_init_once(void *foo)
{
struct dlmfs_inode_private *ip =
(struct dlmfs_inode_private *) foo;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index efc015c6128..44f87caf368 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -606,7 +606,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
res->last_used = 0;
+ spin_lock(&dlm->spinlock);
list_add_tail(&res->tracking, &dlm->tracking_list);
+ spin_unlock(&dlm->spinlock);
memset(res->lvb, 0, DLM_LVB_LEN);
memset(res->refmap, 0, sizeof(res->refmap));
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 394d25a131a..ec684426034 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -31,6 +31,7 @@
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
+#include <linux/time.h>
#define MLOG_MASK_PREFIX ML_DLM_GLUE
#include <cluster/masklog.h>
@@ -59,6 +60,9 @@ struct ocfs2_mask_waiter {
struct completion mw_complete;
unsigned long mw_mask;
unsigned long mw_goal;
+#ifdef CONFIG_OCFS2_FS_STATS
+ unsigned long long mw_lock_start;
+#endif
};
static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
@@ -366,6 +370,75 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
spin_unlock(&ocfs2_dlm_tracking_lock);
}
+#ifdef CONFIG_OCFS2_FS_STATS
+static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
+{
+ res->l_lock_num_prmode = 0;
+ res->l_lock_num_prmode_failed = 0;
+ res->l_lock_total_prmode = 0;
+ res->l_lock_max_prmode = 0;
+ res->l_lock_num_exmode = 0;
+ res->l_lock_num_exmode_failed = 0;
+ res->l_lock_total_exmode = 0;
+ res->l_lock_max_exmode = 0;
+ res->l_lock_refresh = 0;
+}
+
+static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
+ struct ocfs2_mask_waiter *mw, int ret)
+{
+ unsigned long long *num, *sum;
+ unsigned int *max, *failed;
+ struct timespec ts = current_kernel_time();
+ unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start;
+
+ if (level == LKM_PRMODE) {
+ num = &res->l_lock_num_prmode;
+ sum = &res->l_lock_total_prmode;
+ max = &res->l_lock_max_prmode;
+ failed = &res->l_lock_num_prmode_failed;
+ } else if (level == LKM_EXMODE) {
+ num = &res->l_lock_num_exmode;
+ sum = &res->l_lock_total_exmode;
+ max = &res->l_lock_max_exmode;
+ failed = &res->l_lock_num_exmode_failed;
+ } else
+ return;
+
+ (*num)++;
+ (*sum) += time;
+ if (time > *max)
+ *max = time;
+ if (ret)
+ (*failed)++;
+}
+
+static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
+{
+ lockres->l_lock_refresh++;
+}
+
+static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
+{
+ struct timespec ts = current_kernel_time();
+ mw->mw_lock_start = timespec_to_ns(&ts);
+}
+#else
+static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
+{
+}
+static inline void ocfs2_update_lock_stats(struct ocfs2_lock_res *res,
+ int level, struct ocfs2_mask_waiter *mw, int ret)
+{
+}
+static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
+{
+}
+static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
+{
+}
+#endif
+
static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
struct ocfs2_lock_res *res,
enum ocfs2_lock_type type,
@@ -385,6 +458,8 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
res->l_flags = OCFS2_LOCK_INITIALIZED;
ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
+
+ ocfs2_init_lock_stats(res);
}
void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
@@ -1048,6 +1123,7 @@ static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
{
INIT_LIST_HEAD(&mw->mw_item);
init_completion(&mw->mw_complete);
+ ocfs2_init_start_time(mw);
}
static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
@@ -1254,6 +1330,7 @@ out:
goto again;
mlog_errno(ret);
}
+ ocfs2_update_lock_stats(lockres, level, &mw, ret);
mlog_exit(ret);
return ret;
@@ -1554,8 +1631,8 @@ out:
*/
int ocfs2_file_lock(struct file *file, int ex, int trylock)
{
- int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
- unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
+ int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0;
unsigned long flags;
struct ocfs2_file_private *fp = file->private_data;
struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1582,7 +1659,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
* Get the lock at NLMODE to start - that way we
* can cancel the upconvert request if need be.
*/
- ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+ ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1597,7 +1674,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
}
lockres->l_action = OCFS2_AST_CONVERT;
- lkm_flags |= LKM_CONVERT;
+ lkm_flags |= DLM_LKF_CONVERT;
lockres->l_requested = level;
lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
@@ -1664,7 +1741,7 @@ void ocfs2_file_unlock(struct file *file)
if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
return;
- if (lockres->l_level == LKM_NLMODE)
+ if (lockres->l_level == DLM_LOCK_NL)
return;
mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
@@ -1678,11 +1755,11 @@ void ocfs2_file_unlock(struct file *file)
lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
lockres->l_blocking = DLM_LOCK_EX;
- gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
+ gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL);
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen);
+ ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen);
if (ret) {
mlog_errno(ret);
return;
@@ -1947,8 +2024,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
} else {
/* Boo, we have to go to disk. */
/* read bh, cast, ocfs2_refresh_inode */
- status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
- bh, OCFS2_BH_CACHED, inode);
+ status = ocfs2_read_block(inode, oi->ip_blkno, bh);
if (status < 0) {
mlog_errno(status);
goto bail_refresh;
@@ -1983,6 +2059,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
le32_to_cpu(fe->i_flags));
ocfs2_refresh_inode(inode, fe);
+ ocfs2_track_lock_refresh(lockres);
}
status = 0;
@@ -2008,11 +2085,7 @@ static int ocfs2_assign_bh(struct inode *inode,
return 0;
}
- status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- OCFS2_I(inode)->ip_blkno,
- ret_bh,
- OCFS2_BH_CACHED,
- inode);
+ status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
if (status < 0)
mlog_errno(status);
@@ -2267,6 +2340,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
if (status < 0)
mlog_errno(status);
+ ocfs2_track_lock_refresh(lockres);
}
bail:
mlog_exit(status);
@@ -2461,7 +2535,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
}
/* So that debugfs.ocfs2 can determine which format is being used */
-#define OCFS2_DLM_DEBUG_STR_VERSION 1
+#define OCFS2_DLM_DEBUG_STR_VERSION 2
static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
{
int i;
@@ -2502,6 +2576,47 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
for(i = 0; i < DLM_LVB_LEN; i++)
seq_printf(m, "0x%x\t", lvb[i]);
+#ifdef CONFIG_OCFS2_FS_STATS
+# define lock_num_prmode(_l) (_l)->l_lock_num_prmode
+# define lock_num_exmode(_l) (_l)->l_lock_num_exmode
+# define lock_num_prmode_failed(_l) (_l)->l_lock_num_prmode_failed
+# define lock_num_exmode_failed(_l) (_l)->l_lock_num_exmode_failed
+# define lock_total_prmode(_l) (_l)->l_lock_total_prmode
+# define lock_total_exmode(_l) (_l)->l_lock_total_exmode
+# define lock_max_prmode(_l) (_l)->l_lock_max_prmode
+# define lock_max_exmode(_l) (_l)->l_lock_max_exmode
+# define lock_refresh(_l) (_l)->l_lock_refresh
+#else
+# define lock_num_prmode(_l) (0ULL)
+# define lock_num_exmode(_l) (0ULL)
+# define lock_num_prmode_failed(_l) (0)
+# define lock_num_exmode_failed(_l) (0)
+# define lock_total_prmode(_l) (0ULL)
+# define lock_total_exmode(_l) (0ULL)
+# define lock_max_prmode(_l) (0)
+# define lock_max_exmode(_l) (0)
+# define lock_refresh(_l) (0)
+#endif
+ /* The following seq_print was added in version 2 of this output */
+ seq_printf(m, "%llu\t"
+ "%llu\t"
+ "%u\t"
+ "%u\t"
+ "%llu\t"
+ "%llu\t"
+ "%u\t"
+ "%u\t"
+ "%u\t",
+ lock_num_prmode(lockres),
+ lock_num_exmode(lockres),
+ lock_num_prmode_failed(lockres),
+ lock_num_exmode_failed(lockres),
+ lock_total_prmode(lockres),
+ lock_total_exmode(lockres),
+ lock_max_prmode(lockres),
+ lock_max_exmode(lockres),
+ lock_refresh(lockres));
+
/* End the line */
seq_printf(m, "\n");
return 0;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index c58668a326f..2baedac5823 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -25,6 +25,7 @@
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/types.h>
+#include <linux/fiemap.h>
#define MLOG_MASK_PREFIX ML_EXTENT_MAP
#include <cluster/masklog.h>
@@ -32,6 +33,7 @@
#include "ocfs2.h"
#include "alloc.h"
+#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "super.h"
@@ -282,6 +284,50 @@ out:
kfree(new_emi);
}
+static int ocfs2_last_eb_is_empty(struct inode *inode,
+ struct ocfs2_dinode *di)
+{
+ int ret, next_free;
+ u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk);
+ struct buffer_head *eb_bh = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
+
+ ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ ret = -EROFS;
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+ goto out;
+ }
+
+ if (el->l_tree_depth) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in "
+ "leaf block %llu\n", inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
+ }
+
+ next_free = le16_to_cpu(el->l_next_free_rec);
+
+ if (next_free == 0 ||
+ (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0])))
+ ret = 1;
+
+out:
+ brelse(eb_bh);
+ return ret;
+}
+
/*
* Return the 1st index within el which contains an extent start
* larger than v_cluster.
@@ -335,9 +381,9 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
goto no_more_extents;
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+ ret = ocfs2_read_block(inode,
le64_to_cpu(eb->h_next_leaf_blk),
- &next_eb_bh, OCFS2_BH_CACHED, inode);
+ &next_eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -373,42 +419,28 @@ out:
return ret;
}
-int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
- u32 *p_cluster, u32 *num_clusters,
- unsigned int *extent_flags)
+static int ocfs2_get_clusters_nocache(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 v_cluster, unsigned int *hole_len,
+ struct ocfs2_extent_rec *ret_rec,
+ unsigned int *is_last)
{
- int ret, i;
- unsigned int flags = 0;
- struct buffer_head *di_bh = NULL;
- struct buffer_head *eb_bh = NULL;
+ int i, ret, tree_height, len;
struct ocfs2_dinode *di;
- struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_block *uninitialized_var(eb);
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
- u32 coff;
-
- if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
- ret = -ERANGE;
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
- num_clusters, extent_flags);
- if (ret == 0)
- goto out;
+ struct buffer_head *eb_bh = NULL;
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
- &di_bh, OCFS2_BH_CACHED, inode);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ memset(ret_rec, 0, sizeof(*ret_rec));
+ if (is_last)
+ *is_last = 0;
di = (struct ocfs2_dinode *) di_bh->b_data;
el = &di->id2.i_list;
+ tree_height = le16_to_cpu(el->l_tree_depth);
- if (el->l_tree_depth) {
+ if (tree_height > 0) {
ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
if (ret) {
mlog_errno(ret);
@@ -431,46 +463,202 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
i = ocfs2_search_extent_list(el, v_cluster);
if (i == -1) {
/*
- * A hole was found. Return some canned values that
- * callers can key on. If asked for, num_clusters will
- * be populated with the size of the hole.
+ * Holes can be larger than the maximum size of an
+ * extent, so we return their lengths in a seperate
+ * field.
*/
- *p_cluster = 0;
- if (num_clusters) {
+ if (hole_len) {
ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
- v_cluster,
- num_clusters);
+ v_cluster, &len);
if (ret) {
mlog_errno(ret);
goto out;
}
+
+ *hole_len = len;
+ }
+ goto out_hole;
+ }
+
+ rec = &el->l_recs[i];
+
+ BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
+
+ if (!rec->e_blkno) {
+ ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+ "record (%u, %u, 0)", inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
+ ret = -EROFS;
+ goto out;
+ }
+
+ *ret_rec = *rec;
+
+ /*
+ * Checking for last extent is potentially expensive - we
+ * might have to look at the next leaf over to see if it's
+ * empty.
+ *
+ * The first two checks are to see whether the caller even
+ * cares for this information, and if the extent is at least
+ * the last in it's list.
+ *
+ * If those hold true, then the extent is last if any of the
+ * additional conditions hold true:
+ * - Extent list is in-inode
+ * - Extent list is right-most
+ * - Extent list is 2nd to rightmost, with empty right-most
+ */
+ if (is_last) {
+ if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) {
+ if (tree_height == 0)
+ *is_last = 1;
+ else if (eb->h_blkno == di->i_last_eb_blk)
+ *is_last = 1;
+ else if (eb->h_next_leaf_blk == di->i_last_eb_blk) {
+ ret = ocfs2_last_eb_is_empty(inode, di);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ if (ret == 1)
+ *is_last = 1;
+ }
+ }
+ }
+
+out_hole:
+ ret = 0;
+out:
+ brelse(eb_bh);
+ return ret;
+}
+
+static void ocfs2_relative_extent_offsets(struct super_block *sb,
+ u32 v_cluster,
+ struct ocfs2_extent_rec *rec,
+ u32 *p_cluster, u32 *num_clusters)
+
+{
+ u32 coff = v_cluster - le32_to_cpu(rec->e_cpos);
+
+ *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno));
+ *p_cluster = *p_cluster + coff;
+
+ if (num_clusters)
+ *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff;
+}
+
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+ u32 *p_cluster, u32 *num_clusters,
+ struct ocfs2_extent_list *el)
+{
+ int ret = 0, i;
+ struct buffer_head *eb_bh = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_rec *rec;
+ u32 coff;
+
+ if (el->l_tree_depth) {
+ ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
}
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+
+ if (el->l_tree_depth) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in "
+ "xattr leaf block %llu\n", inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
+ }
+ }
+
+ i = ocfs2_search_extent_list(el, v_cluster);
+ if (i == -1) {
+ ret = -EROFS;
+ mlog_errno(ret);
+ goto out;
} else {
rec = &el->l_recs[i];
-
BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
if (!rec->e_blkno) {
ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
- "record (%u, %u, 0)", inode->i_ino,
+ "record (%u, %u, 0) in xattr", inode->i_ino,
le32_to_cpu(rec->e_cpos),
ocfs2_rec_clusters(el, rec));
ret = -EROFS;
goto out;
}
-
coff = v_cluster - le32_to_cpu(rec->e_cpos);
-
*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
le64_to_cpu(rec->e_blkno));
*p_cluster = *p_cluster + coff;
-
if (num_clusters)
*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
+ }
+out:
+ if (eb_bh)
+ brelse(eb_bh);
+ return ret;
+}
+
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+ u32 *p_cluster, u32 *num_clusters,
+ unsigned int *extent_flags)
+{
+ int ret;
+ unsigned int uninitialized_var(hole_len), flags = 0;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_extent_rec rec;
+
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ret = -ERANGE;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
+ num_clusters, extent_flags);
+ if (ret == 0)
+ goto out;
+
+ ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
- flags = rec->e_flags;
+ ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len,
+ &rec, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (rec.e_blkno == 0ULL) {
+ /*
+ * A hole was found. Return some canned values that
+ * callers can key on. If asked for, num_clusters will
+ * be populated with the size of the hole.
+ */
+ *p_cluster = 0;
+ if (num_clusters) {
+ *num_clusters = hole_len;
+ }
+ } else {
+ ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec,
+ p_cluster, num_clusters);
+ flags = rec.e_flags;
- ocfs2_extent_map_insert_rec(inode, rec);
+ ocfs2_extent_map_insert_rec(inode, &rec);
}
if (extent_flags)
@@ -478,7 +666,6 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
out:
brelse(di_bh);
- brelse(eb_bh);
return ret;
}
@@ -521,3 +708,114 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
out:
return ret;
}
+
+static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
+ struct fiemap_extent_info *fieinfo,
+ u64 map_start)
+{
+ int ret;
+ unsigned int id_count;
+ struct ocfs2_dinode *di;
+ u64 phys;
+ u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ id_count = le16_to_cpu(di->id2.i_data.id_count);
+
+ if (map_start < id_count) {
+ phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
+ phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+
+ ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
+ flags);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC)
+
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 map_start, u64 map_len)
+{
+ int ret, is_last;
+ u32 mapping_end, cpos;
+ unsigned int hole_size;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ u64 len_bytes, phys_bytes, virt_bytes;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_extent_rec rec;
+
+ ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS);
+ if (ret)
+ return ret;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ /*
+ * Handle inline-data separately.
+ */
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
+ goto out_unlock;
+ }
+
+ cpos = map_start >> osb->s_clustersize_bits;
+ mapping_end = ocfs2_clusters_for_bytes(inode->i_sb,
+ map_start + map_len);
+ mapping_end -= cpos;
+ is_last = 0;
+ while (cpos < mapping_end && !is_last) {
+ u32 fe_flags;
+
+ ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos,
+ &hole_size, &rec, &is_last);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (rec.e_blkno == 0ULL) {
+ cpos += hole_size;
+ continue;
+ }
+
+ fe_flags = 0;
+ if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
+ fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
+ if (is_last)
+ fe_flags |= FIEMAP_EXTENT_LAST;
+ len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
+ phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits;
+ virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits;
+
+ ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes,
+ len_bytes, fe_flags);
+ if (ret)
+ break;
+
+ cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters);
+ }
+
+ if (ret > 0)
+ ret = 0;
+
+out_unlock:
+ brelse(di_bh);
+
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ocfs2_inode_unlock(inode, 0);
+out:
+
+ return ret;
+}
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index de91e3e41a2..1c4aa8b06f3 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -50,4 +50,11 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
u64 *ret_count, unsigned int *extent_flags);
+int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 map_start, u64 map_len);
+
+int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
+ u32 *p_cluster, u32 *num_clusters,
+ struct ocfs2_extent_list *el);
+
#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 57e0d30cde9..8d3225a7807 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -55,6 +55,7 @@
#include "mmap.h"
#include "suballoc.h"
#include "super.h"
+#include "xattr.h"
#include "buffer_head_io.h"
@@ -184,7 +185,7 @@ static int ocfs2_sync_file(struct file *file,
goto bail;
journal = osb->journal->j_journal;
- err = journal_force_commit(journal);
+ err = jbd2_journal_force_commit(journal);
bail:
mlog_exit(err);
@@ -488,7 +489,7 @@ bail:
}
/*
- * extend allocation only here.
+ * extend file allocation only here.
* we'll update all the disk stuff, and oip->alloc_size
*
* expect stuff to be locked, a transaction started and enough data /
@@ -497,189 +498,25 @@ bail:
* Will return -EAGAIN, and a reason if a restart is needed.
* If passed in, *reason will always be set, even in error.
*/
-int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
- struct inode *inode,
- u32 *logical_offset,
- u32 clusters_to_add,
- int mark_unwritten,
- struct buffer_head *fe_bh,
- handle_t *handle,
- struct ocfs2_alloc_context *data_ac,
- struct ocfs2_alloc_context *meta_ac,
- enum ocfs2_alloc_restarted *reason_ret)
+int ocfs2_add_inode_data(struct ocfs2_super *osb,
+ struct inode *inode,
+ u32 *logical_offset,
+ u32 clusters_to_add,
+ int mark_unwritten,
+ struct buffer_head *fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason_ret)
{
- int status = 0;
- int free_extents;
- struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
- enum ocfs2_alloc_restarted reason = RESTART_NONE;
- u32 bit_off, num_bits;
- u64 block;
- u8 flags = 0;
-
- BUG_ON(!clusters_to_add);
-
- if (mark_unwritten)
- flags = OCFS2_EXT_UNWRITTEN;
-
- free_extents = ocfs2_num_free_extents(osb, inode, fe);
- if (free_extents < 0) {
- status = free_extents;
- mlog_errno(status);
- goto leave;
- }
-
- /* there are two cases which could cause us to EAGAIN in the
- * we-need-more-metadata case:
- * 1) we haven't reserved *any*
- * 2) we are so fragmented, we've needed to add metadata too
- * many times. */
- if (!free_extents && !meta_ac) {
- mlog(0, "we haven't reserved any metadata!\n");
- status = -EAGAIN;
- reason = RESTART_META;
- goto leave;
- } else if ((!free_extents)
- && (ocfs2_alloc_context_bits_left(meta_ac)
- < ocfs2_extend_meta_needed(fe))) {
- mlog(0, "filesystem is really fragmented...\n");
- status = -EAGAIN;
- reason = RESTART_META;
- goto leave;
- }
-
- status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
- clusters_to_add, &bit_off, &num_bits);
- if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
- goto leave;
- }
-
- BUG_ON(num_bits > clusters_to_add);
-
- /* reserve our write early -- insert_extent may update the inode */
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
- mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
- num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
- *logical_offset, block, num_bits,
- flags, meta_ac);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- status = ocfs2_journal_dirty(handle, fe_bh);
- if (status < 0) {
- mlog_errno(status);
- goto leave;
- }
-
- clusters_to_add -= num_bits;
- *logical_offset += num_bits;
-
- if (clusters_to_add) {
- mlog(0, "need to alloc once more, clusters = %u, wanted = "
- "%u\n", fe->i_clusters, clusters_to_add);
- status = -EAGAIN;
- reason = RESTART_TRANS;
- }
-
-leave:
- mlog_exit(status);
- if (reason_ret)
- *reason_ret = reason;
- return status;
-}
-
-/*
- * For a given allocation, determine which allocators will need to be
- * accessed, and lock them, reserving the appropriate number of bits.
- *
- * Sparse file systems call this from ocfs2_write_begin_nolock()
- * and ocfs2_allocate_unwritten_extents().
- *
- * File systems which don't support holes call this from
- * ocfs2_extend_allocation().
- */
-int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
- u32 clusters_to_add, u32 extents_to_split,
- struct ocfs2_alloc_context **data_ac,
- struct ocfs2_alloc_context **meta_ac)
-{
- int ret = 0, num_free_extents;
- unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
- *meta_ac = NULL;
- if (data_ac)
- *data_ac = NULL;
-
- BUG_ON(clusters_to_add != 0 && data_ac == NULL);
-
- mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
- "clusters_to_add = %u, extents_to_split = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
- le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
-
- num_free_extents = ocfs2_num_free_extents(osb, inode, di);
- if (num_free_extents < 0) {
- ret = num_free_extents;
- mlog_errno(ret);
- goto out;
- }
-
- /*
- * Sparse allocation file systems need to be more conservative
- * with reserving room for expansion - the actual allocation
- * happens while we've got a journal handle open so re-taking
- * a cluster lock (because we ran out of room for another
- * extent) will violate ordering rules.
- *
- * Most of the time we'll only be seeing this 1 cluster at a time
- * anyway.
- *
- * Always lock for any unwritten extents - we might want to
- * add blocks during a split.
- */
- if (!num_free_extents ||
- (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
- ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
- if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
- goto out;
- }
- }
-
- if (clusters_to_add == 0)
- goto out;
-
- ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
- if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
- goto out;
- }
+ int ret;
+ struct ocfs2_extent_tree et;
-out:
- if (ret) {
- if (*meta_ac) {
- ocfs2_free_alloc_context(*meta_ac);
- *meta_ac = NULL;
- }
-
- /*
- * We cannot have an error and a non null *data_ac.
- */
- }
+ ocfs2_init_dinode_extent_tree(&et, inode, fe_bh);
+ ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset,
+ clusters_to_add, mark_unwritten,
+ &et, handle,
+ data_ac, meta_ac, reason_ret);
return ret;
}
@@ -698,6 +535,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
struct ocfs2_alloc_context *meta_ac = NULL;
enum ocfs2_alloc_restarted why;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_extent_tree et;
mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
@@ -707,8 +545,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
*/
BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
- status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
- OCFS2_BH_CACHED, inode);
+ status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -724,14 +561,21 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
restart_all:
BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
- status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
- &meta_ac);
+ mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
+ "clusters_to_add = %u\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
+ clusters_to_add);
+ ocfs2_init_dinode_extent_tree(&et, inode, bh);
+ status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+ &data_ac, &meta_ac);
if (status) {
mlog_errno(status);
goto leave;
}
- credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
+ credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
+ clusters_to_add);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
@@ -753,16 +597,16 @@ restarted_transaction:
prev_clusters = OCFS2_I(inode)->ip_clusters;
- status = ocfs2_do_extend_allocation(osb,
- inode,
- &logical_start,
- clusters_to_add,
- mark_unwritten,
- bh,
- handle,
- data_ac,
- meta_ac,
- &why);
+ status = ocfs2_add_inode_data(osb,
+ inode,
+ &logical_start,
+ clusters_to_add,
+ mark_unwritten,
+ bh,
+ handle,
+ data_ac,
+ meta_ac,
+ &why);
if ((status < 0) && (status != -EAGAIN)) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -789,7 +633,7 @@ restarted_transaction:
mlog(0, "restarting transaction.\n");
/* TODO: This can be more intelligent. */
credits = ocfs2_calc_extend_credits(osb->sb,
- fe,
+ &fe->id2.i_list,
clusters_to_add);
status = ocfs2_extend_trans(handle, credits);
if (status < 0) {
@@ -826,10 +670,8 @@ leave:
restart_func = 0;
goto restart_all;
}
- if (bh) {
- brelse(bh);
- bh = NULL;
- }
+ brelse(bh);
+ bh = NULL;
mlog_exit(status);
return status;
@@ -1096,9 +938,15 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
goto bail_unlock;
}
- if (i_size_read(inode) > attr->ia_size)
+ if (i_size_read(inode) > attr->ia_size) {
+ if (ocfs2_should_order_data(inode)) {
+ status = ocfs2_begin_ordered_truncate(inode,
+ attr->ia_size);
+ if (status)
+ goto bail_unlock;
+ }
status = ocfs2_truncate_file(inode, bh, attr->ia_size);
- else
+ } else
status = ocfs2_extend_file(inode, bh, attr->ia_size);
if (status < 0) {
if (status != -ENOSPC)
@@ -1140,8 +988,7 @@ bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
mlog_exit(status);
return status;
@@ -1176,7 +1023,7 @@ bail:
return err;
}
-int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+int ocfs2_permission(struct inode *inode, int mask)
{
int ret;
@@ -1284,8 +1131,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
struct buffer_head *bh = NULL;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1311,9 +1157,8 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
struct buffer_head *di_bh = NULL;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
- ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- OCFS2_I(inode)->ip_blkno, &di_bh,
- OCFS2_BH_CACHED, inode);
+ ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
+ &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1394,8 +1239,11 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
handle_t *handle;
struct ocfs2_alloc_context *meta_ac = NULL;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
- ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
+ ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
if (ret) {
mlog_errno(ret);
return ret;
@@ -1425,7 +1273,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode,
goto out;
}
- ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
+ ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
dealloc);
if (ret) {
mlog_errno(ret);
@@ -1766,8 +1614,8 @@ out_inode_unlock:
out_rw_unlock:
ocfs2_rw_unlock(inode, 1);
- mutex_unlock(&inode->i_mutex);
out:
+ mutex_unlock(&inode->i_mutex);
return ret;
}
@@ -2040,7 +1888,7 @@ out_dio:
*/
if (old_size != i_size_read(inode) ||
old_clusters != OCFS2_I(inode)->ip_clusters) {
- ret = journal_force_commit(osb->journal->j_journal);
+ ret = jbd2_journal_force_commit(osb->journal->j_journal);
if (ret < 0)
written = ret;
}
@@ -2202,7 +2050,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
if (ret == -EINVAL)
- mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n");
+ mlog(0, "generic_file_aio_read returned -EINVAL\n");
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
@@ -2227,7 +2075,12 @@ const struct inode_operations ocfs2_file_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ocfs2_listxattr,
+ .removexattr = generic_removexattr,
.fallocate = ocfs2_fallocate,
+ .fiemap = ocfs2_fiemap,
};
const struct inode_operations ocfs2_special_file_iops = {
@@ -2236,6 +2089,10 @@ const struct inode_operations ocfs2_special_file_iops = {
.permission = ocfs2_permission,
};
+/*
+ * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with
+ * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
+ */
const struct file_operations ocfs2_fops = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -2250,6 +2107,7 @@ const struct file_operations ocfs2_fops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
+ .lock = ocfs2_lock,
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
.splice_write = ocfs2_file_splice_write,
@@ -2266,5 +2124,51 @@ const struct file_operations ocfs2_dops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
+ .lock = ocfs2_lock,
+ .flock = ocfs2_flock,
+};
+
+/*
+ * POSIX-lockless variants of our file_operations.
+ *
+ * These will be used if the underlying cluster stack does not support
+ * posix file locking, if the user passes the "localflocks" mount
+ * option, or if we have a local-only fs.
+ *
+ * ocfs2_flock is in here because all stacks handle UNIX file locks,
+ * so we still want it in the case of no stack support for
+ * plocks. Internally, it will do the right thing when asked to ignore
+ * the cluster.
+ */
+const struct file_operations ocfs2_fops_no_plocks = {
+ .llseek = generic_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .mmap = ocfs2_mmap,
+ .fsync = ocfs2_sync_file,
+ .release = ocfs2_file_release,
+ .open = ocfs2_file_open,
+ .aio_read = ocfs2_file_aio_read,
+ .aio_write = ocfs2_file_aio_write,
+ .unlocked_ioctl = ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ocfs2_compat_ioctl,
+#endif
+ .flock = ocfs2_flock,
+ .splice_read = ocfs2_file_splice_read,
+ .splice_write = ocfs2_file_splice_write,
+};
+
+const struct file_operations ocfs2_dops_no_plocks = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .readdir = ocfs2_readdir,
+ .fsync = ocfs2_sync_file,
+ .release = ocfs2_dir_release,
+ .open = ocfs2_dir_open,
+ .unlocked_ioctl = ocfs2_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ocfs2_compat_ioctl,
+#endif
.flock = ocfs2_flock,
};
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 048ddcaf5c8..e92382cbca5 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -28,9 +28,12 @@
extern const struct file_operations ocfs2_fops;
extern const struct file_operations ocfs2_dops;
+extern const struct file_operations ocfs2_fops_no_plocks;
+extern const struct file_operations ocfs2_dops_no_plocks;
extern const struct inode_operations ocfs2_file_iops;
extern const struct inode_operations ocfs2_special_file_iops;
struct ocfs2_alloc_context;
+enum ocfs2_alloc_restarted;
struct ocfs2_file_private {
struct file *fp_file;
@@ -38,32 +41,22 @@ struct ocfs2_file_private {
struct ocfs2_lock_res fp_flock;
};
-enum ocfs2_alloc_restarted {
- RESTART_NONE = 0,
- RESTART_TRANS,
- RESTART_META
-};
-int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
- struct inode *inode,
- u32 *logical_offset,
- u32 clusters_to_add,
- int mark_unwritten,
- struct buffer_head *fe_bh,
- handle_t *handle,
- struct ocfs2_alloc_context *data_ac,
- struct ocfs2_alloc_context *meta_ac,
- enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_add_inode_data(struct ocfs2_super *osb,
+ struct inode *inode,
+ u32 *logical_offset,
+ u32 clusters_to_add,
+ int mark_unwritten,
+ struct buffer_head *fe_bh,
+ handle_t *handle,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ enum ocfs2_alloc_restarted *reason_ret);
int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
u64 zero_to);
-int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
- u32 clusters_to_add, u32 extents_to_split,
- struct ocfs2_alloc_context **data_ac,
- struct ocfs2_alloc_context **meta_ac);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
-int ocfs2_permission(struct inode *inode, int mask,
- struct nameidata *nd);
+int ocfs2_permission(struct inode *inode, int mask);
int ocfs2_should_update_atime(struct inode *inode,
struct vfsmount *vfsmnt);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7e9e4c79aec..4903688f72a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -49,6 +49,7 @@
#include "symlink.h"
#include "sysfile.h"
#include "uptodate.h"
+#include "xattr.h"
#include "buffer_head_io.h"
@@ -219,6 +220,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
struct super_block *sb;
struct ocfs2_super *osb;
int status = -EINVAL;
+ int use_plocks = 1;
mlog_entry("(0x%p, size:%llu)\n", inode,
(unsigned long long)le64_to_cpu(fe->i_size));
@@ -226,6 +228,10 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
sb = inode->i_sb;
osb = OCFS2_SB(sb);
+ if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
+ ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
+ use_plocks = 0;
+
/* this means that read_inode cannot create a superblock inode
* today. change if needed. */
if (!OCFS2_IS_VALID_DINODE(fe) ||
@@ -295,13 +301,19 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
- inode->i_fop = &ocfs2_fops;
+ if (use_plocks)
+ inode->i_fop = &ocfs2_fops;
+ else
+ inode->i_fop = &ocfs2_fops_no_plocks;
inode->i_op = &ocfs2_file_iops;
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
case S_IFDIR:
inode->i_op = &ocfs2_dir_iops;
- inode->i_fop = &ocfs2_dops;
+ if (use_plocks)
+ inode->i_fop = &ocfs2_dops;
+ else
+ inode->i_fop = &ocfs2_dops_no_plocks;
i_size_write(inode, le64_to_cpu(fe->i_size));
break;
case S_IFLNK:
@@ -448,8 +460,11 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
}
- status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
- can_lock ? inode : NULL);
+ if (can_lock)
+ status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
+ OCFS2_BH_IGNORE_CACHE);
+ else
+ status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -522,6 +537,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
* data and fast symlinks.
*/
if (fe->i_clusters) {
+ if (ocfs2_should_order_data(inode))
+ ocfs2_begin_ordered_truncate(inode, 0);
+
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
@@ -730,6 +748,13 @@ static int ocfs2_wipe_inode(struct inode *inode,
goto bail_unlock_dir;
}
+ /*Free extended attribute resources associated with this inode.*/
+ status = ocfs2_xattr_remove(inode, di_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_unlock_dir;
+ }
+
status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode,
orphan_dir_bh);
if (status < 0)
@@ -1081,6 +1106,8 @@ void ocfs2_clear_inode(struct inode *inode)
oi->ip_last_trans = 0;
oi->ip_dir_start_lookup = 0;
oi->ip_blkno = 0ULL;
+ jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
+ &oi->ip_jinode);
bail:
mlog_exit_void();
@@ -1107,58 +1134,6 @@ void ocfs2_drop_inode(struct inode *inode)
}
/*
- * TODO: this should probably be merged into ocfs2_get_block
- *
- * However, you now need to pay attention to the cont_prepare_write()
- * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much
- * expects never to extend).
- */
-struct buffer_head *ocfs2_bread(struct inode *inode,
- int block, int *err, int reada)
-{
- struct buffer_head *bh = NULL;
- int tmperr;
- u64 p_blkno;
- int readflags = OCFS2_BH_CACHED;
-
- if (reada)
- readflags |= OCFS2_BH_READAHEAD;
-
- if (((u64)block << inode->i_sb->s_blocksize_bits) >=
- i_size_read(inode)) {
- BUG_ON(!reada);
- return NULL;
- }
-
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
- tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
- NULL);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
- if (tmperr < 0) {
- mlog_errno(tmperr);
- goto fail;
- }
-
- tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh,
- readflags, inode);
- if (tmperr < 0)
- goto fail;
-
- tmperr = 0;
-
- *err = 0;
- return bh;
-
-fail:
- if (bh) {
- brelse(bh);
- bh = NULL;
- }
- *err = -EIO;
- return NULL;
-}
-
-/*
* This is called from our getattr.
*/
int ocfs2_inode_revalidate(struct dentry *dentry)
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 390a85596aa..2f37af9bcc4 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -40,6 +40,9 @@ struct ocfs2_inode_info
/* protects allocation changes on this inode. */
struct rw_semaphore ip_alloc_sem;
+ /* protects extended attribute changes on this inode */
+ struct rw_semaphore ip_xattr_sem;
+
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
u32 ip_open_count;
@@ -68,6 +71,7 @@ struct ocfs2_inode_info
struct ocfs2_extent_map ip_extent_map;
struct inode vfs_inode;
+ struct jbd2_inode ip_jinode;
};
/*
@@ -113,8 +117,6 @@ extern struct kmem_cache *ocfs2_inode_cache;
extern const struct address_space_operations ocfs2_aops;
-struct buffer_head *ocfs2_bread(struct inode *inode, int block,
- int *err, int reada);
void ocfs2_clear_inode(struct inode *inode);
void ocfs2_delete_inode(struct inode *inode);
void ocfs2_drop_inode(struct inode *inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7b142f0ce99..9fcd36dcc9a 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -102,8 +102,7 @@ bail_unlock:
bail:
mutex_unlock(&inode->i_mutex);
- if (bh)
- brelse(bh);
+ brelse(bh);
mlog_exit(status);
return status;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 9698338adc3..81e40677eec 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -57,7 +57,7 @@ static int __ocfs2_recovery_thread(void *arg);
static int ocfs2_commit_cache(struct ocfs2_super *osb);
static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
- int dirty);
+ int dirty, int replayed);
static int ocfs2_trylock_journal(struct ocfs2_super *osb,
int slot_num);
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
@@ -215,9 +215,9 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
goto finally;
}
- journal_lock_updates(journal->j_journal);
- status = journal_flush(journal->j_journal);
- journal_unlock_updates(journal->j_journal);
+ jbd2_journal_lock_updates(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal);
+ jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0) {
up_write(&journal->j_trans_barrier);
mlog_errno(status);
@@ -264,7 +264,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
down_read(&osb->journal->j_trans_barrier);
- handle = journal_start(journal, max_buffs);
+ handle = jbd2_journal_start(journal, max_buffs);
if (IS_ERR(handle)) {
up_read(&osb->journal->j_trans_barrier);
@@ -290,7 +290,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
BUG_ON(!handle);
- ret = journal_stop(handle);
+ ret = jbd2_journal_stop(handle);
if (ret < 0)
mlog_errno(ret);
@@ -304,7 +304,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb,
* transaction. extend_trans will either extend the current handle by
* nblocks, or commit it and start a new one with nblocks credits.
*
- * This might call journal_restart() which will commit dirty buffers
+ * This might call jbd2_journal_restart() which will commit dirty buffers
* and then restart the transaction. Before calling
* ocfs2_extend_trans(), any changed blocks should have been
* dirtied. After calling it, all blocks which need to be changed must
@@ -329,10 +329,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
-#ifdef OCFS2_DEBUG_FS
+#ifdef CONFIG_OCFS2_DEBUG_FS
status = 1;
#else
- status = journal_extend(handle, nblocks);
+ status = jbd2_journal_extend(handle, nblocks);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -340,8 +340,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
#endif
if (status > 0) {
- mlog(0, "journal_extend failed, trying journal_restart\n");
- status = journal_restart(handle, nblocks);
+ mlog(0,
+ "jbd2_journal_extend failed, trying "
+ "jbd2_journal_restart\n");
+ status = jbd2_journal_restart(handle, nblocks);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -393,11 +395,11 @@ int ocfs2_journal_access(handle_t *handle,
switch (type) {
case OCFS2_JOURNAL_ACCESS_CREATE:
case OCFS2_JOURNAL_ACCESS_WRITE:
- status = journal_get_write_access(handle, bh);
+ status = jbd2_journal_get_write_access(handle, bh);
break;
case OCFS2_JOURNAL_ACCESS_UNDO:
- status = journal_get_undo_access(handle, bh);
+ status = jbd2_journal_get_undo_access(handle, bh);
break;
default:
@@ -422,7 +424,7 @@ int ocfs2_journal_dirty(handle_t *handle,
mlog_entry("(bh->b_blocknr=%llu)\n",
(unsigned long long)bh->b_blocknr);
- status = journal_dirty_metadata(handle, bh);
+ status = jbd2_journal_dirty_metadata(handle, bh);
if (status < 0)
mlog(ML_ERROR, "Could not dirty metadata buffer. "
"(bh->b_blocknr=%llu)\n",
@@ -432,6 +434,7 @@ int ocfs2_journal_dirty(handle_t *handle,
return status;
}
+#ifdef CONFIG_OCFS2_COMPAT_JBD
int ocfs2_journal_dirty_data(handle_t *handle,
struct buffer_head *bh)
{
@@ -443,8 +446,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
return err;
}
+#endif
-#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE)
+#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
void ocfs2_set_journal_params(struct ocfs2_super *osb)
{
@@ -457,9 +461,9 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb)
spin_lock(&journal->j_state_lock);
journal->j_commit_interval = commit_interval;
if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
- journal->j_flags |= JFS_BARRIER;
+ journal->j_flags |= JBD2_BARRIER;
else
- journal->j_flags &= ~JFS_BARRIER;
+ journal->j_flags &= ~JBD2_BARRIER;
spin_unlock(&journal->j_state_lock);
}
@@ -524,14 +528,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
/* call the kernels journal init function now */
- j_journal = journal_init_inode(inode);
+ j_journal = jbd2_journal_init_inode(inode);
if (j_journal == NULL) {
mlog(ML_ERROR, "Linux journal layer error\n");
status = -EINVAL;
goto done;
}
- mlog(0, "Returned from journal_init_inode\n");
+ mlog(0, "Returned from jbd2_journal_init_inode\n");
mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
@@ -550,8 +554,7 @@ done:
if (status < 0) {
if (inode_lock)
ocfs2_inode_unlock(inode, 1);
- if (bh != NULL)
- brelse(bh);
+ brelse(bh);
if (inode) {
OCFS2_I(inode)->ip_open_count--;
iput(inode);
@@ -562,8 +565,18 @@ done:
return status;
}
+static void ocfs2_bump_recovery_generation(struct ocfs2_dinode *di)
+{
+ le32_add_cpu(&(di->id1.journal1.ij_recovery_generation), 1);
+}
+
+static u32 ocfs2_get_recovery_generation(struct ocfs2_dinode *di)
+{
+ return le32_to_cpu(di->id1.journal1.ij_recovery_generation);
+}
+
static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
- int dirty)
+ int dirty, int replayed)
{
int status;
unsigned int flags;
@@ -593,6 +606,9 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
flags &= ~OCFS2_JOURNAL_DIRTY_FL;
fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+ if (replayed)
+ ocfs2_bump_recovery_generation(fe);
+
status = ocfs2_write_block(osb, bh, journal->j_inode);
if (status < 0)
mlog_errno(status);
@@ -626,7 +642,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
if (journal->j_state != OCFS2_JOURNAL_LOADED)
goto done;
- /* need to inc inode use count as journal_destroy will iput. */
+ /* need to inc inode use count - jbd2_journal_destroy will iput. */
if (!igrab(inode))
BUG();
@@ -655,9 +671,9 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0);
if (ocfs2_mount_local(osb)) {
- journal_lock_updates(journal->j_journal);
- status = journal_flush(journal->j_journal);
- journal_unlock_updates(journal->j_journal);
+ jbd2_journal_lock_updates(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal);
+ jbd2_journal_unlock_updates(journal->j_journal);
if (status < 0)
mlog_errno(status);
}
@@ -667,13 +683,13 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
* Do not toggle if flush was unsuccessful otherwise
* will leave dirty metadata in a "clean" journal
*/
- status = ocfs2_journal_toggle_dirty(osb, 0);
+ status = ocfs2_journal_toggle_dirty(osb, 0, 0);
if (status < 0)
mlog_errno(status);
}
/* Shutdown the kernel journal system */
- journal_destroy(journal->j_journal);
+ jbd2_journal_destroy(journal->j_journal);
OCFS2_I(inode)->ip_open_count--;
@@ -698,19 +714,19 @@ static void ocfs2_clear_journal_error(struct super_block *sb,
{
int olderr;
- olderr = journal_errno(journal);
+ olderr = jbd2_journal_errno(journal);
if (olderr) {
mlog(ML_ERROR, "File system error %d recorded in "
"journal %u.\n", olderr, slot);
mlog(ML_ERROR, "File system on device %s needs checking.\n",
sb->s_id);
- journal_ack_err(journal);
- journal_clear_err(journal);
+ jbd2_journal_ack_err(journal);
+ jbd2_journal_clear_err(journal);
}
}
-int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
+int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
{
int status = 0;
struct ocfs2_super *osb;
@@ -721,7 +737,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
osb = journal->j_osb;
- status = journal_load(journal->j_journal);
+ status = jbd2_journal_load(journal->j_journal);
if (status < 0) {
mlog(ML_ERROR, "Failed to load journal!\n");
goto done;
@@ -729,7 +745,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
- status = ocfs2_journal_toggle_dirty(osb, 1);
+ status = ocfs2_journal_toggle_dirty(osb, 1, replayed);
if (status < 0) {
mlog_errno(status);
goto done;
@@ -765,13 +781,13 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
BUG_ON(!journal);
- status = journal_wipe(journal->j_journal, full);
+ status = jbd2_journal_wipe(journal->j_journal, full);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_journal_toggle_dirty(journal->j_osb, 0);
+ status = ocfs2_journal_toggle_dirty(journal->j_osb, 0, 0);
if (status < 0)
mlog_errno(status);
@@ -834,9 +850,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
/* We are reading journal data which should not
* be put in the uptodate cache */
- status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb),
- p_blkno, p_blocks, bhs, 0,
- NULL);
+ status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb),
+ p_blkno, p_blocks, bhs);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -852,8 +867,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
bail:
for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
- if (bhs[i])
- brelse(bhs[i]);
+ brelse(bhs[i]);
mlog_exit(status);
return status;
}
@@ -1034,6 +1048,12 @@ restart:
spin_unlock(&osb->osb_lock);
mlog(0, "All nodes recovered\n");
+ /* Refresh all journal recovery generations from disk */
+ status = ocfs2_check_journals_nolocks(osb);
+ status = (status == -EROFS) ? 0 : status;
+ if (status < 0)
+ mlog_errno(status);
+
ocfs2_super_unlock(osb, 1);
/* We always run recovery on our own orphan dir - the dead
@@ -1096,6 +1116,43 @@ out:
mlog_exit_void();
}
+static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
+ int slot_num,
+ struct buffer_head **bh,
+ struct inode **ret_inode)
+{
+ int status = -EACCES;
+ struct inode *inode = NULL;
+
+ BUG_ON(slot_num >= osb->max_slots);
+
+ inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
+ slot_num);
+ if (!inode || is_bad_inode(inode)) {
+ mlog_errno(status);
+ goto bail;
+ }
+ SET_INODE_JOURNAL(inode);
+
+ status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
+ OCFS2_BH_IGNORE_CACHE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = 0;
+
+bail:
+ if (inode) {
+ if (status || !ret_inode)
+ iput(inode);
+ else
+ *ret_inode = inode;
+ }
+ return status;
+}
+
/* Does the actual journal replay and marks the journal inode as
* clean. Will only replay if the journal inode is marked dirty. */
static int ocfs2_replay_journal(struct ocfs2_super *osb,
@@ -1109,22 +1166,36 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
struct ocfs2_dinode *fe;
journal_t *journal = NULL;
struct buffer_head *bh = NULL;
+ u32 slot_reco_gen;
- inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE,
- slot_num);
- if (inode == NULL) {
- status = -EACCES;
+ status = ocfs2_read_journal_inode(osb, slot_num, &bh, &inode);
+ if (status) {
mlog_errno(status);
goto done;
}
- if (is_bad_inode(inode)) {
- status = -EACCES;
- iput(inode);
- inode = NULL;
- mlog_errno(status);
+
+ fe = (struct ocfs2_dinode *)bh->b_data;
+ slot_reco_gen = ocfs2_get_recovery_generation(fe);
+ brelse(bh);
+ bh = NULL;
+
+ /*
+ * As the fs recovery is asynchronous, there is a small chance that
+ * another node mounted (and recovered) the slot before the recovery
+ * thread could get the lock. To handle that, we dirty read the journal
+ * inode for that slot to get the recovery generation. If it is
+ * different than what we expected, the slot has been recovered.
+ * If not, it needs recovery.
+ */
+ if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
+ mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num,
+ osb->slot_recovery_generations[slot_num], slot_reco_gen);
+ osb->slot_recovery_generations[slot_num] = slot_reco_gen;
+ status = -EBUSY;
goto done;
}
- SET_INODE_JOURNAL(inode);
+
+ /* Continue with recovery as the journal has not yet been recovered */
status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
if (status < 0) {
@@ -1138,9 +1209,12 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
fe = (struct ocfs2_dinode *) bh->b_data;
flags = le32_to_cpu(fe->id1.journal1.ij_flags);
+ slot_reco_gen = ocfs2_get_recovery_generation(fe);
if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
mlog(0, "No recovery required for node %d\n", node_num);
+ /* Refresh recovery generation for the slot */
+ osb->slot_recovery_generations[slot_num] = slot_reco_gen;
goto done;
}
@@ -1157,19 +1231,19 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
}
mlog(0, "calling journal_init_inode\n");
- journal = journal_init_inode(inode);
+ journal = jbd2_journal_init_inode(inode);
if (journal == NULL) {
mlog(ML_ERROR, "Linux journal layer error\n");
status = -EIO;
goto done;
}
- status = journal_load(journal);
+ status = jbd2_journal_load(journal);
if (status < 0) {
mlog_errno(status);
if (!igrab(inode))
BUG();
- journal_destroy(journal);
+ jbd2_journal_destroy(journal);
goto done;
}
@@ -1177,9 +1251,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
/* wipe the journal */
mlog(0, "flushing the journal.\n");
- journal_lock_updates(journal);
- status = journal_flush(journal);
- journal_unlock_updates(journal);
+ jbd2_journal_lock_updates(journal);
+ status = jbd2_journal_flush(journal);
+ jbd2_journal_unlock_updates(journal);
if (status < 0)
mlog_errno(status);
@@ -1188,6 +1262,11 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
flags &= ~OCFS2_JOURNAL_DIRTY_FL;
fe->id1.journal1.ij_flags = cpu_to_le32(flags);
+ /* Increment recovery generation to indicate successful recovery */
+ ocfs2_bump_recovery_generation(fe);
+ osb->slot_recovery_generations[slot_num] =
+ ocfs2_get_recovery_generation(fe);
+
status = ocfs2_write_block(osb, bh, inode);
if (status < 0)
mlog_errno(status);
@@ -1195,7 +1274,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
if (!igrab(inode))
BUG();
- journal_destroy(journal);
+ jbd2_journal_destroy(journal);
done:
/* drop the lock on this nodes journal */
@@ -1205,8 +1284,7 @@ done:
if (inode)
iput(inode);
- if (bh)
- brelse(bh);
+ brelse(bh);
mlog_exit(status);
return status;
@@ -1252,6 +1330,13 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
status = ocfs2_replay_journal(osb, node_num, slot_num);
if (status < 0) {
+ if (status == -EBUSY) {
+ mlog(0, "Skipping recovery for slot %u (node %u) "
+ "as another node has recovered it\n", slot_num,
+ node_num);
+ status = 0;
+ goto done;
+ }
mlog_errno(status);
goto done;
}
@@ -1334,21 +1419,46 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
{
unsigned int node_num;
int status, i;
+ u32 gen;
+ struct buffer_head *bh = NULL;
+ struct ocfs2_dinode *di;
/* This is called with the super block cluster lock, so we
* know that the slot map can't change underneath us. */
- spin_lock(&osb->osb_lock);
for (i = 0; i < osb->max_slots; i++) {
- if (i == osb->slot_num)
+ /* Read journal inode to get the recovery generation */
+ status = ocfs2_read_journal_inode(osb, i, &bh, NULL);
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ di = (struct ocfs2_dinode *)bh->b_data;
+ gen = ocfs2_get_recovery_generation(di);
+ brelse(bh);
+ bh = NULL;
+
+ spin_lock(&osb->osb_lock);
+ osb->slot_recovery_generations[i] = gen;
+
+ mlog(0, "Slot %u recovery generation is %u\n", i,
+ osb->slot_recovery_generations[i]);
+
+ if (i == osb->slot_num) {
+ spin_unlock(&osb->osb_lock);
continue;
+ }
status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
- if (status == -ENOENT)
+ if (status == -ENOENT) {
+ spin_unlock(&osb->osb_lock);
continue;
+ }
- if (__ocfs2_recovery_map_test(osb, node_num))
+ if (__ocfs2_recovery_map_test(osb, node_num)) {
+ spin_unlock(&osb->osb_lock);
continue;
+ }
spin_unlock(&osb->osb_lock);
/* Ok, we have a slot occupied by another node which
@@ -1364,10 +1474,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
mlog_errno(status);
goto bail;
}
-
- spin_lock(&osb->osb_lock);
}
- spin_unlock(&osb->osb_lock);
status = 0;
bail:
@@ -1603,49 +1710,41 @@ static int ocfs2_commit_thread(void *arg)
return 0;
}
-/* Look for a dirty journal without taking any cluster locks. Used for
- * hard readonly access to determine whether the file system journals
- * require recovery. */
+/* Reads all the journal inodes without taking any cluster locks. Used
+ * for hard readonly access to determine whether any journal requires
+ * recovery. Also used to refresh the recovery generation numbers after
+ * a journal has been recovered by another node.
+ */
int ocfs2_check_journals_nolocks(struct ocfs2_super *osb)
{
int ret = 0;
unsigned int slot;
- struct buffer_head *di_bh;
+ struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
- struct inode *journal = NULL;
+ int journal_dirty = 0;
for(slot = 0; slot < osb->max_slots; slot++) {
- journal = ocfs2_get_system_file_inode(osb,
- JOURNAL_SYSTEM_INODE,
- slot);
- if (!journal || is_bad_inode(journal)) {
- ret = -EACCES;
- mlog_errno(ret);
- goto out;
- }
-
- di_bh = NULL;
- ret = ocfs2_read_block(osb, OCFS2_I(journal)->ip_blkno, &di_bh,
- 0, journal);
- if (ret < 0) {
+ ret = ocfs2_read_journal_inode(osb, slot, &di_bh, NULL);
+ if (ret) {
mlog_errno(ret);
goto out;
}
di = (struct ocfs2_dinode *) di_bh->b_data;
+ osb->slot_recovery_generations[slot] =
+ ocfs2_get_recovery_generation(di);
+
if (le32_to_cpu(di->id1.journal1.ij_flags) &
OCFS2_JOURNAL_DIRTY_FL)
- ret = -EROFS;
+ journal_dirty = 1;
brelse(di_bh);
- if (ret)
- break;
+ di_bh = NULL;
}
out:
- if (journal)
- iput(journal);
-
+ if (journal_dirty)
+ ret = -EROFS;
return ret;
}
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index db82be2532e..d4d14e9a3ce 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,7 +27,12 @@
#define OCFS2_JOURNAL_H
#include <linux/fs.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+# include "ocfs2_jbd_compat.h"
+#endif
enum ocfs2_journal_state {
OCFS2_JOURNAL_FREE = 0,
@@ -161,7 +166,8 @@ int ocfs2_journal_init(struct ocfs2_journal *journal,
void ocfs2_journal_shutdown(struct ocfs2_super *osb);
int ocfs2_journal_wipe(struct ocfs2_journal *journal,
int full);
-int ocfs2_journal_load(struct ocfs2_journal *journal, int local);
+int ocfs2_journal_load(struct ocfs2_journal *journal, int local,
+ int replayed);
int ocfs2_check_journals_nolocks(struct ocfs2_super *osb);
void ocfs2_recovery_thread(struct ocfs2_super *osb,
int node_num);
@@ -214,8 +220,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
* buffer. Will have to call ocfs2_journal_dirty once
* we've actually dirtied it. Type is one of . or .
* ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
- * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before
- * the current handle commits.
+ * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
+ * the current handle commits.
*/
/* You must always start_trans with a number of buffs > 0, but it's
@@ -267,8 +273,10 @@ int ocfs2_journal_access(handle_t *handle,
*/
int ocfs2_journal_dirty(handle_t *handle,
struct buffer_head *bh);
+#ifdef CONFIG_OCFS2_COMPAT_JBD
int ocfs2_journal_dirty_data(handle_t *handle,
struct buffer_head *bh);
+#endif
/*
* Credit Macros:
@@ -282,6 +290,9 @@ int ocfs2_journal_dirty_data(handle_t *handle,
/* simple file updates like chmod, etc. */
#define OCFS2_INODE_UPDATE_CREDITS 1
+/* extended attribute block update */
+#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
+
/* group extend. inode update and last group update. */
#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
@@ -339,11 +350,23 @@ int ocfs2_journal_dirty_data(handle_t *handle,
#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
+ OCFS2_UNLINK_CREDITS)
+/* global bitmap dinode, group desc., relinked group,
+ * suballocator dinode, group desc., relinked group,
+ * dinode, xattr block */
+#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \
+ + OCFS2_INODE_UPDATE_CREDITS \
+ + OCFS2_XATTR_BLOCK_UPDATE_CREDITS)
+
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
static inline int ocfs2_calc_extend_credits(struct super_block *sb,
- struct ocfs2_dinode *fe,
+ struct ocfs2_extent_list *root_el,
u32 bits_wanted)
{
- int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks;
+ int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
/* bitmap dinode, group desc. + relinked group. */
bitmap_blocks = OCFS2_SUBALLOC_ALLOC;
@@ -354,16 +377,16 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
* however many metadata chunks needed * a remaining suballoc
* alloc. */
sysfile_bitmap_blocks = 1 +
- (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe);
+ (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el);
/* this does not include *new* metadata blocks, which are
- * accounted for in sysfile_bitmap_blocks. fe +
+ * accounted for in sysfile_bitmap_blocks. root_el +
* prev. last_eb_blk + blocks along edge of tree.
* calc_symlink_credits passes because we just need 1
* credit for the dinode there. */
- dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth);
+ extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
- return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks;
+ return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
}
static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
@@ -414,4 +437,16 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
return credits;
}
+static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+ return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode);
+}
+
+static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
+ loff_t new_size)
+{
+ return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
+ new_size);
+}
+
#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index be774bdc8b3..687b28713c3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -28,6 +28,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/bitops.h>
+#include <linux/debugfs.h>
#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
@@ -47,8 +48,6 @@
#define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab))
-static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb);
-
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
@@ -75,24 +74,129 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
struct inode *local_alloc_inode);
-static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb)
+#ifdef CONFIG_OCFS2_FS_STATS
+
+static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
+{
+ file->private_data = inode->i_private;
+ return 0;
+}
+
+#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE
+#define LA_DEBUG_VER 1
+static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ static DEFINE_MUTEX(la_debug_mutex);
+ struct ocfs2_super *osb = file->private_data;
+ int written, ret;
+ char *buf = osb->local_alloc_debug_buf;
+
+ mutex_lock(&la_debug_mutex);
+ memset(buf, 0, LA_DEBUG_BUF_SZ);
+
+ written = snprintf(buf, LA_DEBUG_BUF_SZ,
+ "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
+ LA_DEBUG_VER,
+ (unsigned long long)osb->la_last_gd,
+ osb->local_alloc_default_bits,
+ osb->local_alloc_bits, osb->local_alloc_state);
+
+ ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
+
+ mutex_unlock(&la_debug_mutex);
+ return ret;
+}
+
+static const struct file_operations ocfs2_la_debug_fops = {
+ .open = ocfs2_la_debug_open,
+ .read = ocfs2_la_debug_read,
+};
+
+static void ocfs2_init_la_debug(struct ocfs2_super *osb)
+{
+ osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
+ if (!osb->local_alloc_debug_buf)
+ return;
+
+ osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
+ S_IFREG|S_IRUSR,
+ osb->osb_debug_root,
+ osb,
+ &ocfs2_la_debug_fops);
+ if (!osb->local_alloc_debug) {
+ kfree(osb->local_alloc_debug_buf);
+ osb->local_alloc_debug_buf = NULL;
+ }
+}
+
+static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
+{
+ if (osb->local_alloc_debug)
+ debugfs_remove(osb->local_alloc_debug);
+
+ if (osb->local_alloc_debug_buf)
+ kfree(osb->local_alloc_debug_buf);
+
+ osb->local_alloc_debug_buf = NULL;
+ osb->local_alloc_debug = NULL;
+}
+#else /* CONFIG_OCFS2_FS_STATS */
+static void ocfs2_init_la_debug(struct ocfs2_super *osb)
+{
+ return;
+}
+static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
+{
+ return;
+}
+#endif
+
+static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
{
- BUG_ON(osb->s_clustersize_bits > 20);
+ return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
+ osb->local_alloc_state == OCFS2_LA_ENABLED);
+}
- /* Size local alloc windows by the megabyte */
- return osb->local_alloc_size << (20 - osb->s_clustersize_bits);
+void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
+ unsigned int num_clusters)
+{
+ spin_lock(&osb->osb_lock);
+ if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
+ osb->local_alloc_state == OCFS2_LA_THROTTLED)
+ if (num_clusters >= osb->local_alloc_default_bits) {
+ cancel_delayed_work(&osb->la_enable_wq);
+ osb->local_alloc_state = OCFS2_LA_ENABLED;
+ }
+ spin_unlock(&osb->osb_lock);
+}
+
+void ocfs2_la_enable_worker(struct work_struct *work)
+{
+ struct ocfs2_super *osb =
+ container_of(work, struct ocfs2_super,
+ la_enable_wq.work);
+ spin_lock(&osb->osb_lock);
+ osb->local_alloc_state = OCFS2_LA_ENABLED;
+ spin_unlock(&osb->osb_lock);
}
/*
* Tell us whether a given allocation should use the local alloc
* file. Otherwise, it has to go to the main bitmap.
+ *
+ * This function does semi-dirty reads of local alloc size and state!
+ * This is ok however, as the values are re-checked once under mutex.
*/
int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
{
- int la_bits = ocfs2_local_alloc_window_bits(osb);
int ret = 0;
+ int la_bits;
+
+ spin_lock(&osb->osb_lock);
+ la_bits = osb->local_alloc_bits;
- if (osb->local_alloc_state != OCFS2_LA_ENABLED)
+ if (!ocfs2_la_state_enabled(osb))
goto bail;
/* la_bits should be at least twice the size (in clusters) of
@@ -106,6 +210,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
bail:
mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
+ spin_unlock(&osb->osb_lock);
return ret;
}
@@ -120,14 +225,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
mlog_entry_void();
- if (osb->local_alloc_size == 0)
+ ocfs2_init_la_debug(osb);
+
+ if (osb->local_alloc_bits == 0)
goto bail;
- if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) {
+ if (osb->local_alloc_bits >= osb->bitmap_cpg) {
mlog(ML_NOTICE, "Requested local alloc window %d is larger "
"than max possible %u. Using defaults.\n",
- ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1));
- osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
+ osb->local_alloc_bits, (osb->bitmap_cpg - 1));
+ osb->local_alloc_bits =
+ ocfs2_megabytes_to_clusters(osb->sb,
+ OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
}
/* read the alloc off disk */
@@ -139,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
goto bail;
}
- status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
- &alloc_bh, 0, inode);
+ status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
+ &alloc_bh, OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -185,13 +294,14 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
bail:
if (status < 0)
- if (alloc_bh)
- brelse(alloc_bh);
+ brelse(alloc_bh);
if (inode)
iput(inode);
- mlog(0, "Local alloc window bits = %d\n",
- ocfs2_local_alloc_window_bits(osb));
+ if (status < 0)
+ ocfs2_shutdown_la_debug(osb);
+
+ mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
mlog_exit(status);
return status;
@@ -217,6 +327,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
mlog_entry_void();
+ cancel_delayed_work(&osb->la_enable_wq);
+ flush_workqueue(ocfs2_wq);
+
+ ocfs2_shutdown_la_debug(osb);
+
if (osb->local_alloc_state == OCFS2_LA_UNUSED)
goto out;
@@ -295,8 +410,7 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out_unlock:
- if (main_bm_bh)
- brelse(main_bm_bh);
+ brelse(main_bm_bh);
ocfs2_inode_unlock(main_bm_inode, 1);
@@ -345,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
mutex_lock(&inode->i_mutex);
- status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno,
- &alloc_bh, 0, inode);
+ status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
+ &alloc_bh, OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -372,8 +486,7 @@ bail:
*alloc_copy = NULL;
}
- if (alloc_bh)
- brelse(alloc_bh);
+ brelse(alloc_bh);
if (inode) {
mutex_unlock(&inode->i_mutex);
@@ -441,8 +554,7 @@ out_unlock:
out_mutex:
mutex_unlock(&main_bm_inode->i_mutex);
- if (main_bm_bh)
- brelse(main_bm_bh);
+ brelse(main_bm_bh);
iput(main_bm_inode);
@@ -453,8 +565,48 @@ out:
return status;
}
+/* Check to see if the local alloc window is within ac->ac_max_block */
+static int ocfs2_local_alloc_in_range(struct inode *inode,
+ struct ocfs2_alloc_context *ac,
+ u32 bits_wanted)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *alloc;
+ struct ocfs2_local_alloc *la;
+ int start;
+ u64 block_off;
+
+ if (!ac->ac_max_block)
+ return 1;
+
+ alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+ la = OCFS2_LOCAL_ALLOC(alloc);
+
+ start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
+ if (start == -1) {
+ mlog_errno(-ENOSPC);
+ return 0;
+ }
+
+ /*
+ * Converting (bm_off + start + bits_wanted) to blocks gives us
+ * the blkno just past our actual allocation. This is perfect
+ * to compare with ac_max_block.
+ */
+ block_off = ocfs2_clusters_to_blocks(inode->i_sb,
+ le32_to_cpu(la->la_bm_off) +
+ start + bits_wanted);
+ mlog(0, "Checking %llu against %llu\n",
+ (unsigned long long)block_off,
+ (unsigned long long)ac->ac_max_block);
+ if (block_off > ac->ac_max_block)
+ return 0;
+
+ return 1;
+}
+
/*
- * make sure we've got at least bitswanted contiguous bits in the
+ * make sure we've got at least bits_wanted contiguous bits in the
* local alloc. You lose them when you drop i_mutex.
*
* We will add ourselves to the transaction passed in, but may start
@@ -485,20 +637,22 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
mutex_lock(&local_alloc_inode->i_mutex);
- if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
- status = -ENOSPC;
- goto bail;
- }
-
- if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) {
- mlog(0, "Asking for more than my max window size!\n");
+ /*
+ * We must double check state and allocator bits because
+ * another process may have changed them while holding i_mutex.
+ */
+ spin_lock(&osb->osb_lock);
+ if (!ocfs2_la_state_enabled(osb) ||
+ (bits_wanted > osb->local_alloc_bits)) {
+ spin_unlock(&osb->osb_lock);
status = -ENOSPC;
goto bail;
}
+ spin_unlock(&osb->osb_lock);
alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
-#ifdef OCFS2_DEBUG_FS
+#ifdef CONFIG_OCFS2_DEBUG_FS
if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
ocfs2_local_alloc_count_bits(alloc)) {
ocfs2_error(osb->sb, "local alloc inode %llu says it has "
@@ -522,6 +676,36 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
mlog_errno(status);
goto bail;
}
+
+ /*
+ * Under certain conditions, the window slide code
+ * might have reduced the number of bits available or
+ * disabled the the local alloc entirely. Re-check
+ * here and return -ENOSPC if necessary.
+ */
+ status = -ENOSPC;
+ if (!ocfs2_la_state_enabled(osb))
+ goto bail;
+
+ free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
+ le32_to_cpu(alloc->id1.bitmap1.i_used);
+ if (bits_wanted > free_bits)
+ goto bail;
+ }
+
+ if (ac->ac_max_block)
+ mlog(0, "Calling in_range for max block %llu\n",
+ (unsigned long long)ac->ac_max_block);
+
+ if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
+ bits_wanted)) {
+ /*
+ * The window is outside ac->ac_max_block.
+ * This errno tells the caller to keep localalloc enabled
+ * but to get the allocation from the main bitmap.
+ */
+ status = -EFBIG;
+ goto bail;
}
ac->ac_inode = local_alloc_inode;
@@ -789,6 +973,85 @@ bail:
return status;
}
+enum ocfs2_la_event {
+ OCFS2_LA_EVENT_SLIDE, /* Normal window slide. */
+ OCFS2_LA_EVENT_FRAGMENTED, /* The global bitmap has
+ * enough bits theoretically
+ * free, but a contiguous
+ * allocation could not be
+ * found. */
+ OCFS2_LA_EVENT_ENOSPC, /* Global bitmap doesn't have
+ * enough bits free to satisfy
+ * our request. */
+};
+#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ)
+/*
+ * Given an event, calculate the size of our next local alloc window.
+ *
+ * This should always be called under i_mutex of the local alloc inode
+ * so that local alloc disabling doesn't race with processes trying to
+ * use the allocator.
+ *
+ * Returns the state which the local alloc was left in. This value can
+ * be ignored by some paths.
+ */
+static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
+ enum ocfs2_la_event event)
+{
+ unsigned int bits;
+ int state;
+
+ spin_lock(&osb->osb_lock);
+ if (osb->local_alloc_state == OCFS2_LA_DISABLED) {
+ WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED);
+ goto out_unlock;
+ }
+
+ /*
+ * ENOSPC and fragmentation are treated similarly for now.
+ */
+ if (event == OCFS2_LA_EVENT_ENOSPC ||
+ event == OCFS2_LA_EVENT_FRAGMENTED) {
+ /*
+ * We ran out of contiguous space in the primary
+ * bitmap. Drastically reduce the number of bits used
+ * by local alloc until we have to disable it.
+ */
+ bits = osb->local_alloc_bits >> 1;
+ if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) {
+ /*
+ * By setting state to THROTTLED, we'll keep
+ * the number of local alloc bits used down
+ * until an event occurs which would give us
+ * reason to assume the bitmap situation might
+ * have changed.
+ */
+ osb->local_alloc_state = OCFS2_LA_THROTTLED;
+ osb->local_alloc_bits = bits;
+ } else {
+ osb->local_alloc_state = OCFS2_LA_DISABLED;
+ }
+ queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
+ OCFS2_LA_ENABLE_INTERVAL);
+ goto out_unlock;
+ }
+
+ /*
+ * Don't increase the size of the local alloc window until we
+ * know we might be able to fulfill the request. Otherwise, we
+ * risk bouncing around the global bitmap during periods of
+ * low space.
+ */
+ if (osb->local_alloc_state != OCFS2_LA_THROTTLED)
+ osb->local_alloc_bits = osb->local_alloc_default_bits;
+
+out_unlock:
+ state = osb->local_alloc_state;
+ spin_unlock(&osb->osb_lock);
+
+ return state;
+}
+
static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
struct ocfs2_alloc_context **ac,
struct inode **bitmap_inode,
@@ -803,12 +1066,21 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
goto bail;
}
- (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb);
+retry_enospc:
+ (*ac)->ac_bits_wanted = osb->local_alloc_bits;
status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+ if (status == -ENOSPC) {
+ if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
+ OCFS2_LA_DISABLED)
+ goto bail;
+
+ ocfs2_free_ac_resource(*ac);
+ memset(*ac, 0, sizeof(struct ocfs2_alloc_context));
+ goto retry_enospc;
+ }
if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
+ mlog_errno(status);
goto bail;
}
@@ -849,7 +1121,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
"one\n");
mlog(0, "Allocating %u clusters for a new window.\n",
- ocfs2_local_alloc_window_bits(osb));
+ osb->local_alloc_bits);
/* Instruct the allocation code to try the most recently used
* cluster group. We'll re-record the group used this pass
@@ -859,9 +1131,36 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
/* we used the generic suballoc reserve function, but we set
* everything up nicely, so there's no reason why we can't use
* the more specific cluster api to claim bits. */
- status = ocfs2_claim_clusters(osb, handle, ac,
- ocfs2_local_alloc_window_bits(osb),
+ status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
&cluster_off, &cluster_count);
+ if (status == -ENOSPC) {
+retry_enospc:
+ /*
+ * Note: We could also try syncing the journal here to
+ * allow use of any free bits which the current
+ * transaction can't give us access to. --Mark
+ */
+ if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) ==
+ OCFS2_LA_DISABLED)
+ goto bail;
+
+ status = ocfs2_claim_clusters(osb, handle, ac,
+ osb->local_alloc_bits,
+ &cluster_off,
+ &cluster_count);
+ if (status == -ENOSPC)
+ goto retry_enospc;
+ /*
+ * We only shrunk the *minimum* number of in our
+ * request - it's entirely possible that the allocator
+ * might give us more than we asked for.
+ */
+ if (status == 0) {
+ spin_lock(&osb->osb_lock);
+ osb->local_alloc_bits = cluster_count;
+ spin_unlock(&osb->osb_lock);
+ }
+ }
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -905,6 +1204,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
mlog_entry_void();
+ ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
+
/* This will lock the main bitmap for us. */
status = ocfs2_local_alloc_reserve_for_window(osb,
&ac,
@@ -976,8 +1277,7 @@ bail:
if (handle)
ocfs2_commit_trans(osb, handle);
- if (main_bm_bh)
- brelse(main_bm_bh);
+ brelse(main_bm_bh);
if (main_bm_inode)
iput(main_bm_inode);
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 3f76631e110..ac5ea9f8665 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -52,4 +52,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
u32 *bit_off,
u32 *num_bits);
+void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
+ unsigned int num_clusters);
+void ocfs2_la_enable_worker(struct work_struct *work);
+
#endif /* OCFS2_LOCALALLOC_H */
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 203f8714387..544ac624517 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -24,6 +24,7 @@
*/
#include <linux/fs.h>
+#include <linux/fcntl.h>
#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
@@ -32,6 +33,7 @@
#include "dlmglue.h"
#include "file.h"
+#include "inode.h"
#include "locks.h"
static int ocfs2_do_flock(struct file *file, struct inode *inode,
@@ -123,3 +125,16 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
else
return ocfs2_do_flock(file, inode, cmd, fl);
}
+
+int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (!(fl->fl_flags & FL_POSIX))
+ return -ENOLCK;
+ if (__mandatory_lock(inode))
+ return -ENOLCK;
+
+ return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
+}
diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h
index 9743ef2324e..496d488b271 100644
--- a/fs/ocfs2/locks.h
+++ b/fs/ocfs2/locks.h
@@ -27,5 +27,6 @@
#define OCFS2_LOCKS_H
int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl);
+int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl);
#endif /* OCFS2_LOCKS_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index d5d808fe014..485a6aa0ad3 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -60,6 +60,7 @@
#include "symlink.h"
#include "sysfile.h"
#include "uptodate.h"
+#include "xattr.h"
#include "buffer_head_io.h"
@@ -327,14 +328,9 @@ leave:
if (status == -ENOSPC)
mlog(0, "Disk is full\n");
- if (new_fe_bh)
- brelse(new_fe_bh);
-
- if (de_bh)
- brelse(de_bh);
-
- if (parent_fe_bh)
- brelse(parent_fe_bh);
+ brelse(new_fe_bh);
+ brelse(de_bh);
+ brelse(parent_fe_bh);
if ((status < 0) && inode)
iput(inode);
@@ -647,12 +643,9 @@ out_unlock_inode:
out:
ocfs2_inode_unlock(dir, 1);
- if (de_bh)
- brelse(de_bh);
- if (fe_bh)
- brelse(fe_bh);
- if (parent_fe_bh)
- brelse(parent_fe_bh);
+ brelse(de_bh);
+ brelse(fe_bh);
+ brelse(parent_fe_bh);
mlog_exit(err);
@@ -851,17 +844,10 @@ leave:
iput(orphan_dir);
}
- if (fe_bh)
- brelse(fe_bh);
-
- if (dirent_bh)
- brelse(dirent_bh);
-
- if (parent_node_bh)
- brelse(parent_node_bh);
-
- if (orphan_entry_bh)
- brelse(orphan_entry_bh);
+ brelse(fe_bh);
+ brelse(dirent_bh);
+ brelse(parent_node_bh);
+ brelse(orphan_entry_bh);
mlog_exit(status);
@@ -1372,24 +1358,15 @@ bail:
if (new_inode)
iput(new_inode);
- if (newfe_bh)
- brelse(newfe_bh);
- if (old_inode_bh)
- brelse(old_inode_bh);
- if (old_dir_bh)
- brelse(old_dir_bh);
- if (new_dir_bh)
- brelse(new_dir_bh);
- if (new_de_bh)
- brelse(new_de_bh);
- if (old_de_bh)
- brelse(old_de_bh);
- if (old_inode_de_bh)
- brelse(old_inode_de_bh);
- if (orphan_entry_bh)
- brelse(orphan_entry_bh);
- if (insert_entry_bh)
- brelse(insert_entry_bh);
+ brelse(newfe_bh);
+ brelse(old_inode_bh);
+ brelse(old_dir_bh);
+ brelse(new_dir_bh);
+ brelse(new_de_bh);
+ brelse(old_de_bh);
+ brelse(old_inode_de_bh);
+ brelse(orphan_entry_bh);
+ brelse(insert_entry_bh);
mlog_exit(status);
@@ -1492,8 +1469,7 @@ bail:
if (bhs) {
for(i = 0; i < blocks; i++)
- if (bhs[i])
- brelse(bhs[i]);
+ brelse(bhs[i]);
kfree(bhs);
}
@@ -1598,10 +1574,10 @@ static int ocfs2_symlink(struct inode *dir,
u32 offset = 0;
inode->i_op = &ocfs2_symlink_inode_operations;
- status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
- new_fe_bh,
- handle, data_ac, NULL,
- NULL);
+ status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
+ new_fe_bh,
+ handle, data_ac, NULL,
+ NULL);
if (status < 0) {
if (status != -ENOSPC && status != -EINTR) {
mlog(ML_ERROR,
@@ -1659,12 +1635,9 @@ bail:
ocfs2_inode_unlock(dir, 1);
- if (new_fe_bh)
- brelse(new_fe_bh);
- if (parent_fe_bh)
- brelse(parent_fe_bh);
- if (de_bh)
- brelse(de_bh);
+ brelse(new_fe_bh);
+ brelse(parent_fe_bh);
+ brelse(de_bh);
if (inode_ac)
ocfs2_free_alloc_context(inode_ac);
if (data_ac)
@@ -1759,8 +1732,7 @@ leave:
iput(orphan_dir_inode);
}
- if (orphan_dir_bh)
- brelse(orphan_dir_bh);
+ brelse(orphan_dir_bh);
mlog_exit(status);
return status;
@@ -1780,10 +1752,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
- status = ocfs2_read_block(osb,
+ status = ocfs2_read_block(orphan_dir_inode,
OCFS2_I(orphan_dir_inode)->ip_blkno,
- &orphan_dir_bh, OCFS2_BH_CACHED,
- orphan_dir_inode);
+ &orphan_dir_bh);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -1829,8 +1800,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
(unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
leave:
- if (orphan_dir_bh)
- brelse(orphan_dir_bh);
+ brelse(orphan_dir_bh);
mlog_exit(status);
return status;
@@ -1898,8 +1868,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
}
leave:
- if (target_de_bh)
- brelse(target_de_bh);
+ brelse(target_de_bh);
mlog_exit(status);
return status;
@@ -1918,4 +1887,8 @@ const struct inode_operations ocfs2_dir_iops = {
.setattr = ocfs2_setattr,
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ocfs2_listxattr,
+ .removexattr = generic_removexattr,
};
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 31692379c17..a21a465490c 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -34,7 +34,12 @@
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/mutex.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+# include "ocfs2_jbd_compat.h"
+#endif
/* For union ocfs2_dlm_lksb */
#include "stackglue.h"
@@ -132,6 +137,18 @@ struct ocfs2_lock_res {
wait_queue_head_t l_event;
struct list_head l_debug_list;
+
+#ifdef CONFIG_OCFS2_FS_STATS
+ unsigned long long l_lock_num_prmode; /* PR acquires */
+ unsigned long long l_lock_num_exmode; /* EX acquires */
+ unsigned int l_lock_num_prmode_failed; /* Failed PR gets */
+ unsigned int l_lock_num_exmode_failed; /* Failed EX gets */
+ unsigned long long l_lock_total_prmode; /* Tot wait for PR */
+ unsigned long long l_lock_total_exmode; /* Tot wait for EX */
+ unsigned int l_lock_max_prmode; /* Max wait for PR */
+ unsigned int l_lock_max_exmode; /* Max wait for EX */
+ unsigned int l_lock_refresh; /* Disk refreshes */
+#endif
};
struct ocfs2_dlm_debug {
@@ -159,9 +176,13 @@ struct ocfs2_alloc_stats
enum ocfs2_local_alloc_state
{
- OCFS2_LA_UNUSED = 0,
- OCFS2_LA_ENABLED,
- OCFS2_LA_DISABLED
+ OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for
+ * this mountpoint. */
+ OCFS2_LA_ENABLED, /* Local alloc is in use. */
+ OCFS2_LA_THROTTLED, /* Local alloc is in use, but number
+ * of bits has been reduced. */
+ OCFS2_LA_DISABLED /* Local alloc has temporarily been
+ * disabled. */
};
enum ocfs2_mount_options
@@ -172,6 +193,8 @@ enum ocfs2_mount_options
OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
+ OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
+ OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -192,6 +215,8 @@ struct ocfs2_super
struct ocfs2_slot_info *slot_info;
+ u32 *slot_recovery_generations;
+
spinlock_t node_map_lock;
u64 root_blkno;
@@ -200,6 +225,7 @@ struct ocfs2_super
u32 bitmap_cpg;
u8 *uuid;
char *uuid_str;
+ u32 uuid_hash;
u8 *vol_label;
u64 first_cluster_group_blkno;
u32 fs_generation;
@@ -227,6 +253,7 @@ struct ocfs2_super
int s_sectsize_bits;
int s_clustersize;
int s_clustersize_bits;
+ unsigned int s_xattr_inline_size;
atomic_t vol_state;
struct mutex recovery_lock;
@@ -238,11 +265,27 @@ struct ocfs2_super
struct ocfs2_journal *journal;
unsigned long osb_commit_interval;
- int local_alloc_size;
- enum ocfs2_local_alloc_state local_alloc_state;
+ struct delayed_work la_enable_wq;
+
+ /*
+ * Must hold local alloc i_mutex and osb->osb_lock to change
+ * local_alloc_bits. Reads can be done under either lock.
+ */
+ unsigned int local_alloc_bits;
+ unsigned int local_alloc_default_bits;
+
+ enum ocfs2_local_alloc_state local_alloc_state; /* protected
+ * by osb_lock */
+
struct buffer_head *local_alloc_bh;
+
u64 la_last_gd;
+#ifdef CONFIG_OCFS2_FS_STATS
+ struct dentry *local_alloc_debug;
+ char *local_alloc_debug_buf;
+#endif
+
/* Next two fields are for local node slot recovery during
* mount. */
int dirty;
@@ -326,6 +369,13 @@ static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
return 0;
}
+static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)
+ return 1;
+ return 0;
+}
+
/* set / clear functions because cluster events can make these happen
* in parallel so we want the transitions to be atomic. this also
* means that any future flags osb_flags must be protected by spinlock
@@ -540,6 +590,14 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
return pages_per_cluster;
}
+static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb,
+ unsigned int megs)
+{
+ BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576);
+
+ return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits);
+}
+
static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
{
spin_lock(&osb->osb_lock);
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 52c42666515..f24ce3d3f95 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -64,6 +64,7 @@
#define OCFS2_INODE_SIGNATURE "INODE01"
#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
+#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
/* Compatibility flags */
#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -90,7 +91,8 @@
| OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
| OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
| OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
- | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK)
+ | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
+ | OCFS2_FEATURE_INCOMPAT_XATTR)
#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
/*
@@ -127,10 +129,6 @@
/* Support for data packed into inode blocks */
#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040
-/* Support for the extended slot map */
-#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
-
-
/*
* Support for alternate, userspace cluster stacks. If set, the superblock
* field s_cluster_info contains a tag for the alternate stack in use as
@@ -142,6 +140,12 @@
*/
#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080
+/* Support for the extended slot map */
+#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
+
+/* Support for extended attributes */
+#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
+
/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
@@ -299,6 +303,12 @@ struct ocfs2_new_group_input {
*/
#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8
+/*
+ * Inline extended attribute size (in bytes)
+ * The value chosen should be aligned to 16 byte boundaries.
+ */
+#define OCFS2_MIN_XATTR_INLINE_SIZE 256
+
struct ocfs2_system_inode_info {
char *si_name;
int si_iflags;
@@ -563,7 +573,7 @@ struct ocfs2_super_block {
/*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts
before tunefs required */
__le16 s_tunefs_flag;
- __le32 s_reserved1;
+ __le32 s_uuid_hash; /* hash value of uuid */
__le64 s_first_cluster_group; /* Block offset of 1st cluster
* group header */
/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
@@ -571,7 +581,11 @@ struct ocfs2_super_block {
/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
stack. Only valid
with INCOMPAT flag. */
-/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */
+/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size
+ for this fs*/
+ __le16 s_reserved0;
+ __le32 s_reserved1;
+/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */
/*140*/
/*
@@ -621,7 +635,8 @@ struct ocfs2_dinode {
belongs to */
__le16 i_suballoc_bit; /* Bit offset in suballocator
block group */
-/*10*/ __le32 i_reserved0;
+/*10*/ __le16 i_reserved0;
+ __le16 i_xattr_inline_size;
__le32 i_clusters; /* Cluster count */
__le32 i_uid; /* Owner UID */
__le32 i_gid; /* Owning GID */
@@ -640,11 +655,12 @@ struct ocfs2_dinode {
__le32 i_atime_nsec;
__le32 i_ctime_nsec;
__le32 i_mtime_nsec;
- __le32 i_attr;
+/*70*/ __le32 i_attr;
__le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
was set in i_flags */
__le16 i_dyn_features;
-/*70*/ __le64 i_reserved2[8];
+ __le64 i_xattr_loc;
+/*80*/ __le64 i_reserved2[7];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
64bit union */
@@ -660,7 +676,10 @@ struct ocfs2_dinode {
struct { /* Info for journal system
inodes */
__le32 ij_flags; /* Mounted, version, etc. */
- __le32 ij_pad;
+ __le32 ij_recovery_generation; /* Incremented when the
+ journal is recovered
+ after an unclean
+ shutdown */
} journal1;
} id1; /* Inode type dependant 1 */
/*C0*/ union {
@@ -712,6 +731,136 @@ struct ocfs2_group_desc
/*40*/ __u8 bg_bitmap[0];
};
+/*
+ * On disk extended attribute structure for OCFS2.
+ */
+
+/*
+ * ocfs2_xattr_entry indicates one extend attribute.
+ *
+ * Note that it can be stored in inode, one block or one xattr bucket.
+ */
+struct ocfs2_xattr_entry {
+ __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */
+ __le16 xe_name_offset; /* byte offset from the 1st etnry in the local
+ local xattr storage(inode, xattr block or
+ xattr bucket). */
+ __u8 xe_name_len; /* xattr name len, does't include prefix. */
+ __u8 xe_type; /* the low 7 bits indicates the name prefix's
+ * type and the highest 1 bits indicate whether
+ * the EA is stored in the local storage. */
+ __le64 xe_value_size; /* real xattr value length. */
+};
+
+/*
+ * On disk structure for xattr header.
+ *
+ * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records in
+ * the local xattr storage.
+ */
+struct ocfs2_xattr_header {
+ __le16 xh_count; /* contains the count of how
+ many records are in the
+ local xattr storage. */
+ __le16 xh_free_start; /* current offset for storing
+ xattr. */
+ __le16 xh_name_value_len; /* total length of name/value
+ length in this bucket. */
+ __le16 xh_num_buckets; /* bucket nums in one extent
+ record, only valid in the
+ first bucket. */
+ __le64 xh_csum;
+ struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
+};
+
+/*
+ * On disk structure for xattr value root.
+ *
+ * It is used when one extended attribute's size is larger, and we will save it
+ * in an outside cluster. It will stored in a b-tree like file content.
+ */
+struct ocfs2_xattr_value_root {
+/*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */
+ __le32 xr_reserved0;
+ __le64 xr_last_eb_blk; /* Pointer to last extent block */
+/*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */
+};
+
+/*
+ * On disk structure for xattr tree root.
+ *
+ * It is used when there are too many extended attributes for one file. These
+ * attributes will be organized and stored in an indexed-btree.
+ */
+struct ocfs2_xattr_tree_root {
+/*00*/ __le32 xt_clusters; /* clusters covered by xattr. */
+ __le32 xt_reserved0;
+ __le64 xt_last_eb_blk; /* Pointer to last extent block */
+/*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */
+};
+
+#define OCFS2_XATTR_INDEXED 0x1
+#define OCFS2_HASH_SHIFT 5
+#define OCFS2_XATTR_ROUND 3
+#define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \
+ ~(OCFS2_XATTR_ROUND))
+
+#define OCFS2_XATTR_BUCKET_SIZE 4096
+#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \
+ / OCFS2_MIN_BLOCKSIZE)
+
+/*
+ * On disk structure for xattr block.
+ */
+struct ocfs2_xattr_block {
+/*00*/ __u8 xb_signature[8]; /* Signature for verification */
+ __le16 xb_suballoc_slot; /* Slot suballocator this
+ block belongs to. */
+ __le16 xb_suballoc_bit; /* Bit offset in suballocator
+ block group */
+ __le32 xb_fs_generation; /* Must match super block */
+/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */
+ __le64 xb_csum;
+/*20*/ __le16 xb_flags; /* Indicates whether this block contains
+ real xattr or a xattr tree. */
+ __le16 xb_reserved0;
+ __le32 xb_reserved1;
+ __le64 xb_reserved2;
+/*30*/ union {
+ struct ocfs2_xattr_header xb_header; /* xattr header if this
+ block contains xattr */
+ struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this
+ block cotains xattr
+ tree. */
+ } xb_attrs;
+};
+
+#define OCFS2_XATTR_ENTRY_LOCAL 0x80
+#define OCFS2_XATTR_TYPE_MASK 0x7F
+static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe,
+ int local)
+{
+ if (local)
+ xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL;
+ else
+ xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL;
+}
+
+static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe)
+{
+ return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL;
+}
+
+static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type)
+{
+ xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK;
+}
+
+static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
+{
+ return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
+}
+
#ifdef __KERNEL__
static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
{
@@ -725,6 +874,20 @@ static inline int ocfs2_max_inline_data(struct super_block *sb)
offsetof(struct ocfs2_dinode, id2.i_data.id_data);
}
+static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
+ struct ocfs2_dinode *di)
+{
+ unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+ if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+ return sb->s_blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_data.id_data) -
+ xattrsize;
+ else
+ return sb->s_blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+}
+
static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
{
int size;
@@ -735,6 +898,24 @@ static inline int ocfs2_extent_recs_per_inode(struct super_block *sb)
return size / sizeof(struct ocfs2_extent_rec);
}
+static inline int ocfs2_extent_recs_per_inode_with_xattr(
+ struct super_block *sb,
+ struct ocfs2_dinode *di)
+{
+ int size;
+ unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
+
+ if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_list.l_recs) -
+ xattrsize;
+ else
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_dinode, id2.i_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
+
static inline int ocfs2_chain_recs_per_inode(struct super_block *sb)
{
int size;
@@ -798,6 +979,17 @@ static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index)
return 0;
}
+
+static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb)
+{
+ int size;
+
+ size = sb->s_blocksize -
+ offsetof(struct ocfs2_xattr_block,
+ xb_attrs.xb_root.xt_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
#else
static inline int ocfs2_fast_symlink_chars(int blocksize)
{
@@ -881,6 +1073,17 @@ static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index)
return 0;
}
+
+static inline int ocfs2_xattr_recs_per_xb(int blocksize)
+{
+ int size;
+
+ size = blocksize -
+ offsetof(struct ocfs2_xattr_block,
+ xb_attrs.xb_root.xt_list.l_recs);
+
+ return size / sizeof(struct ocfs2_extent_rec);
+}
#endif /* __KERNEL__ */
@@ -901,7 +1104,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len,
* list has a copy per slot.
*/
if (type <= OCFS2_LAST_GLOBAL_SYSTEM_INODE)
- chars = snprintf(buf, len,
+ chars = snprintf(buf, len, "%s",
ocfs2_system_inodes[type].si_name);
else
chars = snprintf(buf, len,
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
new file mode 100644
index 00000000000..b91c78f8f55
--- /dev/null
+++ b/fs/ocfs2/ocfs2_jbd_compat.h
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * ocfs2_jbd_compat.h
+ *
+ * Compatibility defines for JBD.
+ *
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_JBD_COMPAT_H
+#define OCFS2_JBD_COMPAT_H
+
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# error Should not have been included
+#endif
+
+struct jbd2_inode {
+ unsigned int dummy;
+};
+
+#define JBD2_BARRIER JFS_BARRIER
+#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE
+
+#define jbd2_journal_ack_err journal_ack_err
+#define jbd2_journal_clear_err journal_clear_err
+#define jbd2_journal_destroy journal_destroy
+#define jbd2_journal_dirty_metadata journal_dirty_metadata
+#define jbd2_journal_errno journal_errno
+#define jbd2_journal_extend journal_extend
+#define jbd2_journal_flush journal_flush
+#define jbd2_journal_force_commit journal_force_commit
+#define jbd2_journal_get_write_access journal_get_write_access
+#define jbd2_journal_get_undo_access journal_get_undo_access
+#define jbd2_journal_init_inode journal_init_inode
+#define jbd2_journal_invalidatepage journal_invalidatepage
+#define jbd2_journal_load journal_load
+#define jbd2_journal_lock_updates journal_lock_updates
+#define jbd2_journal_restart journal_restart
+#define jbd2_journal_start journal_start
+#define jbd2_journal_start_commit journal_start_commit
+#define jbd2_journal_stop journal_stop
+#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers
+#define jbd2_journal_unlock_updates journal_unlock_updates
+#define jbd2_journal_wipe journal_wipe
+#define jbd2_log_wait_commit log_wait_commit
+
+static inline int jbd2_journal_file_inode(handle_t *handle,
+ struct jbd2_inode *inode)
+{
+ return 0;
+}
+
+static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+ loff_t new_size)
+{
+ return 0;
+}
+
+static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
+ struct inode *inode)
+{
+ return;
+}
+
+static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
+ struct jbd2_inode *jinode)
+{
+ return;
+}
+
+
+#endif /* OCFS2_JBD_COMPAT_H */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 8166968e901..ffd48db229a 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -200,7 +200,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data)
if (cluster > clusters)
break;
- ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL);
+ ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup);
if (ret < 0) {
mlog_errno(ret);
break;
@@ -236,8 +236,8 @@ static void ocfs2_update_super_and_backups(struct inode *inode,
* update the superblock last.
* It doesn't matter if the write failed.
*/
- ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO,
- &super_bh, 0, NULL);
+ ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1,
+ &super_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -332,8 +332,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
first_new_cluster - 1);
- ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED,
- main_bm_inode);
+ ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
@@ -540,7 +539,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
goto out_unlock;
}
- ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL);
+ ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh);
if (ret < 0) {
mlog(ML_ERROR, "Can't read the group descriptor # %llu "
"from the device.", (unsigned long long)input->group);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bb5ff8939bf..bdda2d8f850 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
* be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
* this is not true, the read of -1 (UINT64_MAX) will fail.
*/
- ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0,
- si->si_inode);
+ ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
+ OCFS2_BH_IGNORE_CACHE);
if (ret == 0) {
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
@@ -404,7 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
(unsigned long long)blkno);
bh = NULL; /* Acquire a fresh bh */
- status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
+ status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
+ OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index c021280dd46..faec2d87935 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,12 +21,14 @@
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
+#include <linux/smp_lock.h>
#include <linux/reboot.h>
#include <asm/uaccess.h>
#include "ocfs2.h" /* For struct ocfs2_lock_res */
#include "stackglue.h"
+#include <linux/dlm_plock.h>
/*
* The control protocol starts with a handshake. Until the handshake
@@ -549,26 +551,17 @@ static ssize_t ocfs2_control_read(struct file *file,
size_t count,
loff_t *ppos)
{
- char *proto_string = OCFS2_CONTROL_PROTO;
- size_t to_write = 0;
-
- if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
- return 0;
-
- to_write = OCFS2_CONTROL_PROTO_LEN - *ppos;
- if (to_write > count)
- to_write = count;
- if (copy_to_user(buf, proto_string + *ppos, to_write))
- return -EFAULT;
+ ssize_t ret;
- *ppos += to_write;
+ ret = simple_read_from_buffer(buf, count, ppos,
+ OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN);
/* Have we read the whole protocol list? */
- if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
+ if (ret > 0 && *ppos >= OCFS2_CONTROL_PROTO_LEN)
ocfs2_control_set_handshake_state(file,
OCFS2_CONTROL_HANDSHAKE_READ);
- return to_write;
+ return ret;
}
static int ocfs2_control_release(struct inode *inode, struct file *file)
@@ -619,10 +612,12 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
return -ENOMEM;
p->op_this_node = -1;
+ lock_kernel();
mutex_lock(&ocfs2_control_lock);
file->private_data = p;
list_add(&p->op_list, &ocfs2_control_private_list);
mutex_unlock(&ocfs2_control_lock);
+ unlock_kernel();
return 0;
}
@@ -752,6 +747,37 @@ static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
{
}
+static int user_plock(struct ocfs2_cluster_connection *conn,
+ u64 ino,
+ struct file *file,
+ int cmd,
+ struct file_lock *fl)
+{
+ /*
+ * This more or less just demuxes the plock request into any
+ * one of three dlm calls.
+ *
+ * Internally, fs/dlm will pass these to a misc device, which
+ * a userspace daemon will read and write to.
+ *
+ * For now, cancel requests (which happen internally only),
+ * are turned into unlocks. Most of this function taken from
+ * gfs2_lock.
+ */
+
+ if (cmd == F_CANCELLK) {
+ cmd = F_SETLK;
+ fl->fl_type = F_UNLCK;
+ }
+
+ if (IS_GETLK(cmd))
+ return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
+ else if (fl->fl_type == F_UNLCK)
+ return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
+ else
+ return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
+}
+
/*
* Compare a requested locking protocol version against the current one.
*
@@ -845,6 +871,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
.dlm_unlock = user_dlm_unlock,
.lock_status = user_dlm_lock_status,
.lock_lvb = user_dlm_lvb,
+ .plock = user_plock,
.dump_lksb = user_dlm_dump_lksb,
};
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 10e149ae5e3..68b668b0e60 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -97,13 +97,14 @@ static int ocfs2_stack_driver_request(const char *stack_name,
goto out;
}
- /* Ok, the stack is pinned */
- p->sp_count++;
active_stack = p;
-
rc = 0;
out:
+ /* If we found it, pin it */
+ if (!rc)
+ active_stack->sp_count++;
+
spin_unlock(&ocfs2_stack_lock);
return rc;
}
@@ -287,6 +288,26 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
}
EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
+int ocfs2_stack_supports_plocks(void)
+{
+ return active_stack && active_stack->sp_ops->plock;
+}
+EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks);
+
+/*
+ * ocfs2_plock() can only be safely called if
+ * ocfs2_stack_supports_plocks() returned true
+ */
+int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
+ struct file *file, int cmd, struct file_lock *fl)
+{
+ WARN_ON_ONCE(active_stack->sp_ops->plock == NULL);
+ if (active_stack->sp_ops->plock)
+ return active_stack->sp_ops->plock(conn, ino, file, cmd, fl);
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL_GPL(ocfs2_plock);
+
int ocfs2_cluster_connect(const char *stack_name,
const char *group,
int grouplen,
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index db56281dd1b..c571af375ef 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -28,6 +28,10 @@
#include "dlm/dlmapi.h"
#include <linux/dlm.h>
+/* Needed for plock-related prototypes */
+struct file;
+struct file_lock;
+
/*
* dlmconstants.h does not have a LOCAL flag. We hope to remove it
* some day, but right now we need it. Let's fake it. This value is larger
@@ -187,6 +191,17 @@ struct ocfs2_stack_operations {
void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
/*
+ * Cluster-aware posix locks
+ *
+ * This is NULL for stacks which do not support posix locks.
+ */
+ int (*plock)(struct ocfs2_cluster_connection *conn,
+ u64 ino,
+ struct file *file,
+ int cmd,
+ struct file_lock *fl);
+
+ /*
* This is an optoinal debugging hook. If provided, the
* stack can dump debugging information about this lock.
*/
@@ -240,6 +255,10 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
+int ocfs2_stack_supports_plocks(void);
+int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
+ struct file *file, int cmd, struct file_lock *fl);
+
void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d2d278fb981..c5ff18b46b5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -62,15 +62,18 @@ static int ocfs2_block_group_fill(handle_t *handle,
struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
struct inode *alloc_inode,
- struct buffer_head *bh);
+ struct buffer_head *bh,
+ u64 max_block);
static int ocfs2_cluster_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
+ u64 max_block,
u16 *bit_off, u16 *bits_found);
static int ocfs2_block_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
+ u64 max_block,
u16 *bit_off, u16 *bits_found);
static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac,
@@ -110,8 +113,11 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
u64 data_blkno,
u64 *bg_blkno,
u16 *bg_bit_off);
+static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
+ u32 bits_wanted, u64 max_block,
+ struct ocfs2_alloc_context **ac);
-static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
+void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
{
struct inode *inode = ac->ac_inode;
@@ -124,10 +130,8 @@ static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
iput(inode);
ac->ac_inode = NULL;
}
- if (ac->ac_bh) {
- brelse(ac->ac_bh);
- ac->ac_bh = NULL;
- }
+ brelse(ac->ac_bh);
+ ac->ac_bh = NULL;
}
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
@@ -276,7 +280,8 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
*/
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
struct inode *alloc_inode,
- struct buffer_head *bh)
+ struct buffer_head *bh,
+ u64 max_block)
{
int status, credits;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
@@ -294,9 +299,9 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
mlog_entry_void();
cl = &fe->id2.i_chain;
- status = ocfs2_reserve_clusters(osb,
- le16_to_cpu(cl->cl_cpg),
- &ac);
+ status = ocfs2_reserve_clusters_with_limit(osb,
+ le16_to_cpu(cl->cl_cpg),
+ max_block, &ac);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -394,8 +399,7 @@ bail:
if (ac)
ocfs2_free_alloc_context(ac);
- if (bg_bh)
- brelse(bg_bh);
+ brelse(bg_bh);
mlog_exit(status);
return status;
@@ -469,7 +473,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
+ status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
+ ac->ac_max_block);
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
@@ -486,16 +491,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
get_bh(bh);
ac->ac_bh = bh;
bail:
- if (bh)
- brelse(bh);
+ brelse(bh);
mlog_exit(status);
return status;
}
-int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
- struct ocfs2_dinode *fe,
- struct ocfs2_alloc_context **ac)
+int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
+ int blocks,
+ struct ocfs2_alloc_context **ac)
{
int status;
u32 slot;
@@ -507,7 +511,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
goto bail;
}
- (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
+ (*ac)->ac_bits_wanted = blocks;
(*ac)->ac_which = OCFS2_AC_USE_META;
slot = osb->slot_num;
(*ac)->ac_group_search = ocfs2_block_group_search;
@@ -532,6 +536,15 @@ bail:
return status;
}
+int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
+ struct ocfs2_extent_list *root_el,
+ struct ocfs2_alloc_context **ac)
+{
+ return ocfs2_reserve_new_metadata_blocks(osb,
+ ocfs2_extend_meta_needed(root_el),
+ ac);
+}
+
static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac)
{
@@ -582,6 +595,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
(*ac)->ac_group_search = ocfs2_block_group_search;
/*
+ * stat(2) can't handle i_ino > 32bits, so we tell the
+ * lower levels not to allocate us a block group past that
+ * limit. The 'inode64' mount option avoids this behavior.
+ */
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
+ (*ac)->ac_max_block = (u32)~0U;
+
+ /*
* slot is set when we successfully steal inode from other nodes.
* It is reset in 3 places:
* 1. when we flush the truncate log
@@ -661,9 +682,9 @@ bail:
/* Callers don't need to care which bitmap (local alloc or main) to
* use so we figure it out for them, but unfortunately this clutters
* things a bit. */
-int ocfs2_reserve_clusters(struct ocfs2_super *osb,
- u32 bits_wanted,
- struct ocfs2_alloc_context **ac)
+static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
+ u32 bits_wanted, u64 max_block,
+ struct ocfs2_alloc_context **ac)
{
int status;
@@ -677,24 +698,20 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
}
(*ac)->ac_bits_wanted = bits_wanted;
+ (*ac)->ac_max_block = max_block;
status = -ENOSPC;
if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
status = ocfs2_reserve_local_alloc_bits(osb,
bits_wanted,
*ac);
- if ((status < 0) && (status != -ENOSPC)) {
+ if (status == -EFBIG) {
+ /* The local alloc window is outside ac_max_block.
+ * use the main bitmap. */
+ status = -ENOSPC;
+ } else if ((status < 0) && (status != -ENOSPC)) {
mlog_errno(status);
goto bail;
- } else if (status == -ENOSPC) {
- /* reserve_local_bits will return enospc with
- * the local alloc inode still locked, so we
- * can change this safely here. */
- mlog(0, "Disabling local alloc\n");
- /* We set to OCFS2_LA_DISABLED so that umount
- * can clean up what's left of the local
- * allocation */
- osb->local_alloc_state = OCFS2_LA_DISABLED;
}
}
@@ -718,6 +735,13 @@ bail:
return status;
}
+int ocfs2_reserve_clusters(struct ocfs2_super *osb,
+ u32 bits_wanted,
+ struct ocfs2_alloc_context **ac)
+{
+ return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
+}
+
/*
* More or less lifted from ext3. I'll leave their description below:
*
@@ -1000,11 +1024,14 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg
static int ocfs2_cluster_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
+ u64 max_block,
u16 *bit_off, u16 *bits_found)
{
int search = -ENOSPC;
int ret;
+ u64 blkoff;
struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
u16 tmp_off, tmp_found;
unsigned int max_bits, gd_cluster_off;
@@ -1037,6 +1064,17 @@ static int ocfs2_cluster_group_search(struct inode *inode,
if (ret)
return ret;
+ if (max_block) {
+ blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
+ gd_cluster_off +
+ tmp_off + tmp_found);
+ mlog(0, "Checking %llu against %llu\n",
+ (unsigned long long)blkoff,
+ (unsigned long long)max_block);
+ if (blkoff > max_block)
+ return -ENOSPC;
+ }
+
/* ocfs2_block_group_find_clear_bits() might
* return success, but we still want to return
* -ENOSPC unless it found the minimum number
@@ -1045,6 +1083,12 @@ static int ocfs2_cluster_group_search(struct inode *inode,
*bit_off = tmp_off;
*bits_found = tmp_found;
search = 0; /* success */
+ } else if (tmp_found) {
+ /*
+ * Don't show bits which we'll be returning
+ * for allocation to the local alloc bitmap.
+ */
+ ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
}
}
@@ -1054,19 +1098,31 @@ static int ocfs2_cluster_group_search(struct inode *inode,
static int ocfs2_block_group_search(struct inode *inode,
struct buffer_head *group_bh,
u32 bits_wanted, u32 min_bits,
+ u64 max_block,
u16 *bit_off, u16 *bits_found)
{
int ret = -ENOSPC;
+ u64 blkoff;
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
BUG_ON(min_bits != 1);
BUG_ON(ocfs2_is_cluster_bitmap(inode));
- if (bg->bg_free_bits_count)
+ if (bg->bg_free_bits_count) {
ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
group_bh, bits_wanted,
le16_to_cpu(bg->bg_bits),
bit_off, bits_found);
+ if (!ret && max_block) {
+ blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
+ *bits_found;
+ mlog(0, "Checking %llu against %llu\n",
+ (unsigned long long)blkoff,
+ (unsigned long long)max_block);
+ if (blkoff > max_block)
+ ret = -ENOSPC;
+ }
+ }
return ret;
}
@@ -1116,8 +1172,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
struct ocfs2_group_desc *gd;
struct inode *alloc_inode = ac->ac_inode;
- ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
- &group_bh, OCFS2_BH_CACHED, alloc_inode);
+ ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -1131,7 +1186,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
}
ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
- bit_off, &found);
+ ac->ac_max_block, bit_off, &found);
if (ret < 0) {
if (ret != -ENOSPC)
mlog_errno(ret);
@@ -1186,9 +1241,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
bits_wanted, chain,
(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
- status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
+ status = ocfs2_read_block(alloc_inode,
le64_to_cpu(cl->cl_recs[chain].c_blkno),
- &group_bh, OCFS2_BH_CACHED, alloc_inode);
+ &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1204,21 +1259,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
/* for now, the chain search is a bit simplistic. We just use
* the 1st group with any empty bits. */
while ((status = ac->ac_group_search(alloc_inode, group_bh,
- bits_wanted, min_bits, bit_off,
+ bits_wanted, min_bits,
+ ac->ac_max_block, bit_off,
&tmp_bits)) == -ENOSPC) {
if (!bg->bg_next_group)
break;
- if (prev_group_bh) {
- brelse(prev_group_bh);
- prev_group_bh = NULL;
- }
+ brelse(prev_group_bh);
+ prev_group_bh = NULL;
+
next_group = le64_to_cpu(bg->bg_next_group);
prev_group_bh = group_bh;
group_bh = NULL;
- status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
- next_group, &group_bh,
- OCFS2_BH_CACHED, alloc_inode);
+ status = ocfs2_read_block(alloc_inode,
+ next_group, &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1307,10 +1361,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
*bg_blkno = le64_to_cpu(bg->bg_blkno);
*bits_left = le16_to_cpu(bg->bg_free_bits_count);
bail:
- if (group_bh)
- brelse(group_bh);
- if (prev_group_bh)
- brelse(prev_group_bh);
+ brelse(group_bh);
+ brelse(prev_group_bh);
mlog_exit(status);
return status;
@@ -1723,7 +1775,6 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
{
int status = 0;
u32 tmp_used;
- struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
struct ocfs2_chain_list *cl = &fe->id2.i_chain;
struct buffer_head *group_bh = NULL;
@@ -1742,8 +1793,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
(unsigned long long)bg_blkno, start_bit);
- status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
- alloc_inode);
+ status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1784,8 +1834,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
}
bail:
- if (group_bh)
- brelse(group_bh);
+ brelse(group_bh);
mlog_exit(status);
return status;
@@ -1838,9 +1887,15 @@ int ocfs2_free_clusters(handle_t *handle,
status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
bg_start_bit, bg_blkno,
num_clusters);
- if (status < 0)
+ if (status < 0) {
mlog_errno(status);
+ goto out;
+ }
+ ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
+ num_clusters);
+
+out:
mlog_exit(status);
return status;
}
@@ -1891,3 +1946,84 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
(unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
}
}
+
+/*
+ * For a given allocation, determine which allocators will need to be
+ * accessed, and lock them, reserving the appropriate number of bits.
+ *
+ * Sparse file systems call this from ocfs2_write_begin_nolock()
+ * and ocfs2_allocate_unwritten_extents().
+ *
+ * File systems which don't support holes call this from
+ * ocfs2_extend_allocation().
+ */
+int ocfs2_lock_allocators(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 clusters_to_add, u32 extents_to_split,
+ struct ocfs2_alloc_context **data_ac,
+ struct ocfs2_alloc_context **meta_ac)
+{
+ int ret = 0, num_free_extents;
+ unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ *meta_ac = NULL;
+ if (data_ac)
+ *data_ac = NULL;
+
+ BUG_ON(clusters_to_add != 0 && data_ac == NULL);
+
+ num_free_extents = ocfs2_num_free_extents(osb, inode, et);
+ if (num_free_extents < 0) {
+ ret = num_free_extents;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Sparse allocation file systems need to be more conservative
+ * with reserving room for expansion - the actual allocation
+ * happens while we've got a journal handle open so re-taking
+ * a cluster lock (because we ran out of room for another
+ * extent) will violate ordering rules.
+ *
+ * Most of the time we'll only be seeing this 1 cluster at a time
+ * anyway.
+ *
+ * Always lock for any unwritten extents - we might want to
+ * add blocks during a split.
+ */
+ if (!num_free_extents ||
+ (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
+ ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (clusters_to_add == 0)
+ goto out;
+
+ ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (ret) {
+ if (*meta_ac) {
+ ocfs2_free_alloc_context(*meta_ac);
+ *meta_ac = NULL;
+ }
+
+ /*
+ * We cannot have an error and a non null *data_ac.
+ */
+ }
+
+ return ret;
+}
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 544c600662b..4df159d8f45 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -28,10 +28,11 @@
typedef int (group_search_t)(struct inode *,
struct buffer_head *,
- u32,
- u32,
- u16 *,
- u16 *);
+ u32, /* bits_wanted */
+ u32, /* min_bits */
+ u64, /* max_block */
+ u16 *, /* *bit_off */
+ u16 *); /* *bits_found */
struct ocfs2_alloc_context {
struct inode *ac_inode; /* which bitmap are we allocating from? */
@@ -51,6 +52,8 @@ struct ocfs2_alloc_context {
group_search_t *ac_group_search;
u64 ac_last_group;
+ u64 ac_max_block; /* Highest block number to allocate. 0 is
+ is the same as ~0 - unlimited */
};
void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac);
@@ -59,9 +62,17 @@ static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac)
return ac->ac_bits_wanted - ac->ac_bits_given;
}
+/*
+ * Please note that the caller must make sure that root_el is the root
+ * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise
+ * the result may be wrong.
+ */
int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
- struct ocfs2_dinode *fe,
+ struct ocfs2_extent_list *root_el,
struct ocfs2_alloc_context **ac);
+int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
+ int blocks,
+ struct ocfs2_alloc_context **ac);
int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
struct ocfs2_alloc_context **ac);
int ocfs2_reserve_clusters(struct ocfs2_super *osb,
@@ -147,6 +158,7 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode)
* apis above. */
int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
struct ocfs2_alloc_context *ac);
+void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
/* given a cluster offset, calculate which block group it belongs to
* and return that block offset. */
@@ -156,4 +168,8 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
int ocfs2_check_group_descriptor(struct super_block *sb,
struct ocfs2_dinode *di,
struct ocfs2_group_desc *gd);
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
+ u32 clusters_to_add, u32 extents_to_split,
+ struct ocfs2_alloc_context **data_ac,
+ struct ocfs2_alloc_context **meta_ac);
#endif /* _CHAINALLOC_H_ */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index df63ba20ae9..304b63ac78c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -64,6 +64,7 @@
#include "sysfile.h"
#include "uptodate.h"
#include "ver.h"
+#include "xattr.h"
#include "buffer_head_io.h"
@@ -154,10 +155,13 @@ enum {
Opt_localalloc,
Opt_localflocks,
Opt_stack,
+ Opt_user_xattr,
+ Opt_nouser_xattr,
+ Opt_inode64,
Opt_err,
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_barrier, "barrier=%u"},
{Opt_err_panic, "errors=panic"},
{Opt_err_ro, "errors=remount-ro"},
@@ -173,6 +177,9 @@ static match_table_t tokens = {
{Opt_localalloc, "localalloc=%d"},
{Opt_localflocks, "localflocks"},
{Opt_stack, "cluster_stack=%s"},
+ {Opt_user_xattr, "user_xattr"},
+ {Opt_nouser_xattr, "nouser_xattr"},
+ {Opt_inode64, "inode64"},
{Opt_err, NULL}
};
@@ -205,10 +212,11 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
ocfs2_schedule_truncate_log_flush(osb, 0);
}
- if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) {
+ if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal,
+ &target)) {
if (wait)
- log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
- target);
+ jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal,
+ target);
}
return 0;
}
@@ -325,6 +333,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
if (!oi)
return NULL;
+ jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
return &oi->vfs_inode;
}
@@ -406,6 +415,15 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
goto out;
}
+ /* Probably don't want this on remount; it might
+ * mess with other nodes */
+ if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) &&
+ (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) {
+ ret = -EINVAL;
+ mlog(ML_ERROR, "Cannot enable inode64 on remount\n");
+ goto out;
+ }
+
/* We're going to/from readonly mode. */
if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
/* Lock here so the check of HARD_RO and the potential
@@ -637,7 +655,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
osb->osb_commit_interval = parsed_options.commit_interval;
- osb->local_alloc_size = parsed_options.localalloc_opt;
+ osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
+ osb->local_alloc_bits = osb->local_alloc_default_bits;
status = ocfs2_verify_userspace_stack(osb, &parsed_options);
if (status)
@@ -743,8 +762,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
return status;
read_super_error:
- if (bh != NULL)
- brelse(bh);
+ brelse(bh);
if (inode)
iput(inode);
@@ -847,6 +865,12 @@ static int ocfs2_parse_options(struct super_block *sb,
case Opt_data_writeback:
mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK;
break;
+ case Opt_user_xattr:
+ mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR;
+ break;
+ case Opt_nouser_xattr:
+ mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR;
+ break;
case Opt_atime_quantum:
if (match_int(&args[0], &option)) {
status = 0;
@@ -873,7 +897,7 @@ static int ocfs2_parse_options(struct super_block *sb,
if (option < 0)
return 0;
if (option == 0)
- option = JBD_DEFAULT_MAX_COMMIT_AGE;
+ option = JBD2_DEFAULT_MAX_COMMIT_AGE;
mopt->commit_interval = HZ * option;
break;
case Opt_localalloc:
@@ -918,6 +942,9 @@ static int ocfs2_parse_options(struct super_block *sb,
OCFS2_STACK_LABEL_LEN);
mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
break;
+ case Opt_inode64:
+ mopt->mount_opt |= OCFS2_MOUNT_INODE64;
+ break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -938,6 +965,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
{
struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb);
unsigned long opts = osb->s_mount_opt;
+ unsigned int local_alloc_megs;
if (opts & OCFS2_MOUNT_HB_LOCAL)
seq_printf(s, ",_netdev,heartbeat=local");
@@ -970,8 +998,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
seq_printf(s, ",commit=%u",
(unsigned) (osb->osb_commit_interval / HZ));
- if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
- seq_printf(s, ",localalloc=%d", osb->local_alloc_size);
+ local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits);
+ if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE)
+ seq_printf(s, ",localalloc=%d", local_alloc_megs);
if (opts & OCFS2_MOUNT_LOCALFLOCKS)
seq_printf(s, ",localflocks,");
@@ -980,6 +1009,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
osb->osb_cluster_stack);
+ if (opts & OCFS2_MOUNT_NOUSERXATTR)
+ seq_printf(s, ",nouser_xattr");
+ else
+ seq_printf(s, ",user_xattr");
+
+ if (opts & OCFS2_MOUNT_INODE64)
+ seq_printf(s, ",inode64");
+
return 0;
}
@@ -1118,7 +1155,7 @@ bail:
return status;
}
-static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
+static void ocfs2_inode_init_once(void *data)
{
struct ocfs2_inode_info *oi = data;
@@ -1132,6 +1169,7 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data)
oi->ip_dir_start_lookup = 0;
init_rwsem(&oi->ip_alloc_sem);
+ init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
oi->ip_blkno = 0ULL;
@@ -1375,6 +1413,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_fs_info = osb;
sb->s_op = &ocfs2_sops;
sb->s_export_op = &ocfs2_export_ops;
+ sb->s_xattr = ocfs2_xattr_handlers;
sb->s_time_gran = 1;
sb->s_flags |= MS_NOATIME;
/* this is needed to support O_LARGEFILE */
@@ -1421,8 +1460,12 @@ static int ocfs2_initialize_super(struct super_block *sb,
osb->slot_num = OCFS2_INVALID_SLOT;
+ osb->s_xattr_inline_size = le16_to_cpu(
+ di->id2.i_super.s_xattr_inline_size);
+
osb->local_alloc_state = OCFS2_LA_UNUSED;
osb->local_alloc_bh = NULL;
+ INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker);
init_waitqueue_head(&osb->osb_mount_event);
@@ -1442,6 +1485,15 @@ static int ocfs2_initialize_super(struct super_block *sb,
}
mlog(0, "max_slots for this device: %u\n", osb->max_slots);
+ osb->slot_recovery_generations =
+ kcalloc(osb->max_slots, sizeof(*osb->slot_recovery_generations),
+ GFP_KERNEL);
+ if (!osb->slot_recovery_generations) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
init_waitqueue_head(&osb->osb_wipe_event);
osb->osb_orphan_wipes = kcalloc(osb->max_slots,
sizeof(*osb->osb_orphan_wipes),
@@ -1559,6 +1611,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
osb->first_cluster_group_blkno =
le64_to_cpu(di->id2.i_super.s_first_cluster_group);
osb->fs_generation = le32_to_cpu(di->i_fs_generation);
+ osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
mlog(0, "vol_label: %s\n", osb->vol_label);
mlog(0, "uuid: %s\n", osb->uuid_str);
mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
@@ -1703,7 +1756,11 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
local = ocfs2_mount_local(osb);
/* will play back anything left in the journal. */
- ocfs2_journal_load(osb->journal, local);
+ status = ocfs2_journal_load(osb->journal, local, dirty);
+ if (status < 0) {
+ mlog(ML_ERROR, "ocfs2 journal load failed! %d\n", status);
+ goto finally;
+ }
if (dirty) {
/* recover my local alloc if we didn't unmount cleanly. */
@@ -1764,6 +1821,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
ocfs2_free_slot_info(osb);
kfree(osb->osb_orphan_wipes);
+ kfree(osb->slot_recovery_generations);
/* FIXME
* This belongs in journal shutdown, but because we have to
* allocate osb->journal at the start of ocfs2_initalize_osb(),
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index ba9dbb51d25..cbd03dfdc7b 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -50,6 +50,7 @@
#include "inode.h"
#include "journal.h"
#include "symlink.h"
+#include "xattr.h"
#include "buffer_head_io.h"
@@ -83,11 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
mlog_entry_void();
- status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
- OCFS2_I(inode)->ip_blkno,
- bh,
- OCFS2_BH_CACHED,
- inode);
+ status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
if (status < 0) {
mlog_errno(status);
link = ERR_PTR(status);
@@ -157,8 +154,7 @@ bail:
kunmap(page);
page_cache_release(page);
}
- if (bh)
- brelse(bh);
+ brelse(bh);
return ERR_PTR(status);
}
@@ -168,10 +164,18 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
.follow_link = ocfs2_follow_link,
.getattr = ocfs2_getattr,
.setattr = ocfs2_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ocfs2_listxattr,
+ .removexattr = generic_removexattr,
};
const struct inode_operations ocfs2_fast_symlink_inode_operations = {
.readlink = ocfs2_readlink,
.follow_link = ocfs2_follow_link,
.getattr = ocfs2_getattr,
.setattr = ocfs2_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .listxattr = ocfs2_listxattr,
+ .removexattr = generic_removexattr,
};
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 4da8851f2b2..187b99ff036 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -53,7 +53,11 @@
#include <linux/highmem.h>
#include <linux/buffer_head.h>
#include <linux/rbtree.h>
-#include <linux/jbd.h>
+#ifndef CONFIG_OCFS2_COMPAT_JBD
+# include <linux/jbd2.h>
+#else
+# include <linux/jbd.h>
+#endif
#define MLOG_MASK_PREFIX ML_UPTODATE
@@ -511,14 +515,10 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
ci->ci_num_cached--;
}
-/* Called when we remove a chunk of metadata from an inode. We don't
- * bother reverting things to an inlined array in the case of a remove
- * which moves us back under the limit. */
-void ocfs2_remove_from_cache(struct inode *inode,
- struct buffer_head *bh)
+static void ocfs2_remove_block_from_cache(struct inode *inode,
+ sector_t block)
{
int index;
- sector_t block = bh->b_blocknr;
struct ocfs2_meta_cache_item *item = NULL;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_caching_info *ci = &oi->ip_metadata_cache;
@@ -544,6 +544,30 @@ void ocfs2_remove_from_cache(struct inode *inode,
kmem_cache_free(ocfs2_uptodate_cachep, item);
}
+/*
+ * Called when we remove a chunk of metadata from an inode. We don't
+ * bother reverting things to an inlined array in the case of a remove
+ * which moves us back under the limit.
+ */
+void ocfs2_remove_from_cache(struct inode *inode,
+ struct buffer_head *bh)
+{
+ sector_t block = bh->b_blocknr;
+
+ ocfs2_remove_block_from_cache(inode, block);
+}
+
+/* Called when we remove xattr clusters from an inode. */
+void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+ sector_t block,
+ u32 c_len)
+{
+ unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len;
+
+ for (i = 0; i < b_len; i++, block++)
+ ocfs2_remove_block_from_cache(inode, block);
+}
+
int __init init_ocfs2_uptodate_cache(void)
{
ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate",
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 2e73206059a..531b4b3a0c4 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -40,6 +40,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
struct buffer_head *bh);
void ocfs2_remove_from_cache(struct inode *inode,
struct buffer_head *bh);
+void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode,
+ sector_t block,
+ u32 c_len);
int ocfs2_buffer_read_ahead(struct inode *inode,
struct buffer_head *bh);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
new file mode 100644
index 00000000000..c25780a70df
--- /dev/null
+++ b/fs/ocfs2/xattr.c
@@ -0,0 +1,4834 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.c
+ *
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is taken from ext3.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/splice.h>
+#include <linux/mount.h>
+#include <linux/writeback.h>
+#include <linux/falloc.h>
+#include <linux/sort.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_XATTR
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "symlink.h"
+#include "sysfile.h"
+#include "inode.h"
+#include "journal.h"
+#include "ocfs2_fs.h"
+#include "suballoc.h"
+#include "uptodate.h"
+#include "buffer_head_io.h"
+#include "super.h"
+#include "xattr.h"
+
+
+struct ocfs2_xattr_def_value_root {
+ struct ocfs2_xattr_value_root xv;
+ struct ocfs2_extent_rec er;
+};
+
+struct ocfs2_xattr_bucket {
+ struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+ struct ocfs2_xattr_header *xh;
+};
+
+#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
+#define OCFS2_XATTR_INLINE_SIZE 80
+
+static struct ocfs2_xattr_def_value_root def_xv = {
+ .xv.xr_list.l_count = cpu_to_le16(1),
+};
+
+struct xattr_handler *ocfs2_xattr_handlers[] = {
+ &ocfs2_xattr_user_handler,
+ &ocfs2_xattr_trusted_handler,
+ NULL
+};
+
+static struct xattr_handler *ocfs2_xattr_handler_map[] = {
+ [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
+ [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
+};
+
+struct ocfs2_xattr_info {
+ int name_index;
+ const char *name;
+ const void *value;
+ size_t value_len;
+};
+
+struct ocfs2_xattr_search {
+ struct buffer_head *inode_bh;
+ /*
+ * xattr_bh point to the block buffer head which has extended attribute
+ * when extended attribute in inode, xattr_bh is equal to inode_bh.
+ */
+ struct buffer_head *xattr_bh;
+ struct ocfs2_xattr_header *header;
+ struct ocfs2_xattr_bucket bucket;
+ void *base;
+ void *end;
+ struct ocfs2_xattr_entry *here;
+ int not_found;
+};
+
+static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+ struct ocfs2_xattr_header *xh,
+ int index,
+ int *block_off,
+ int *new_offset);
+
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+ struct buffer_head *root_bh,
+ int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs);
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+ struct ocfs2_xattr_tree_root *xt,
+ char *buffer,
+ size_t buffer_size);
+
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+ struct ocfs2_xattr_search *xs);
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs);
+
+static int ocfs2_delete_xattr_index_block(struct inode *inode,
+ struct buffer_head *xb_bh);
+
+static inline const char *ocfs2_xattr_prefix(int name_index)
+{
+ struct xattr_handler *handler = NULL;
+
+ if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
+ handler = ocfs2_xattr_handler_map[name_index];
+
+ return handler ? handler->prefix : NULL;
+}
+
+static u32 ocfs2_xattr_name_hash(struct inode *inode,
+ const char *name,
+ int name_len)
+{
+ /* Get hash value of uuid from super block */
+ u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash;
+ int i;
+
+ /* hash extended attribute name */
+ for (i = 0; i < name_len; i++) {
+ hash = (hash << OCFS2_HASH_SHIFT) ^
+ (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^
+ *name++;
+ }
+
+ return hash;
+}
+
+/*
+ * ocfs2_xattr_hash_entry()
+ *
+ * Compute the hash of an extended attribute.
+ */
+static void ocfs2_xattr_hash_entry(struct inode *inode,
+ struct ocfs2_xattr_header *header,
+ struct ocfs2_xattr_entry *entry)
+{
+ u32 hash = 0;
+ char *name = (char *)header + le16_to_cpu(entry->xe_name_offset);
+
+ hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len);
+ entry->xe_name_hash = cpu_to_le32(hash);
+
+ return;
+}
+
+static int ocfs2_xattr_extend_allocation(struct inode *inode,
+ u32 clusters_to_add,
+ struct buffer_head *xattr_bh,
+ struct ocfs2_xattr_value_root *xv)
+{
+ int status = 0;
+ int restart_func = 0;
+ int credits = 0;
+ handle_t *handle = NULL;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ enum ocfs2_alloc_restarted why;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
+ struct ocfs2_extent_tree et;
+
+ mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
+
+ ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
+
+restart_all:
+
+ status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+ &data_ac, &meta_ac);
+ if (status) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
+ clusters_to_add);
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(status);
+ goto leave;
+ }
+
+restarted_transaction:
+ status = ocfs2_journal_access(handle, inode, xattr_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ prev_clusters = le32_to_cpu(xv->xr_clusters);
+ status = ocfs2_add_clusters_in_btree(osb,
+ inode,
+ &logical_start,
+ clusters_to_add,
+ 0,
+ &et,
+ handle,
+ data_ac,
+ meta_ac,
+ &why);
+ if ((status < 0) && (status != -EAGAIN)) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto leave;
+ }
+
+ status = ocfs2_journal_dirty(handle, xattr_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
+
+ if (why != RESTART_NONE && clusters_to_add) {
+ if (why == RESTART_META) {
+ mlog(0, "restarting function.\n");
+ restart_func = 1;
+ } else {
+ BUG_ON(why != RESTART_TRANS);
+
+ mlog(0, "restarting transaction.\n");
+ /* TODO: This can be more intelligent. */
+ credits = ocfs2_calc_extend_credits(osb->sb,
+ et.et_root_el,
+ clusters_to_add);
+ status = ocfs2_extend_trans(handle, credits);
+ if (status < 0) {
+ /* handle still has to be committed at
+ * this point. */
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto leave;
+ }
+ goto restarted_transaction;
+ }
+ }
+
+leave:
+ if (handle) {
+ ocfs2_commit_trans(osb, handle);
+ handle = NULL;
+ }
+ if (data_ac) {
+ ocfs2_free_alloc_context(data_ac);
+ data_ac = NULL;
+ }
+ if (meta_ac) {
+ ocfs2_free_alloc_context(meta_ac);
+ meta_ac = NULL;
+ }
+ if ((!status) && restart_func) {
+ restart_func = 0;
+ goto restart_all;
+ }
+
+ return status;
+}
+
+static int __ocfs2_remove_xattr_range(struct inode *inode,
+ struct buffer_head *root_bh,
+ struct ocfs2_xattr_value_root *xv,
+ u32 cpos, u32 phys_cpos, u32 len,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+ u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ handle_t *handle;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
+
+ ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ le32_add_cpu(&xv->xr_clusters, -len);
+
+ ret = ocfs2_journal_dirty(handle, root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ mutex_unlock(&tl_inode->i_mutex);
+
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ return ret;
+}
+
+static int ocfs2_xattr_shrink_size(struct inode *inode,
+ u32 old_clusters,
+ u32 new_clusters,
+ struct buffer_head *root_bh,
+ struct ocfs2_xattr_value_root *xv)
+{
+ int ret = 0;
+ u32 trunc_len, cpos, phys_cpos, alloc_size;
+ u64 block;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ if (old_clusters <= new_clusters)
+ return 0;
+
+ cpos = new_clusters;
+ trunc_len = old_clusters - new_clusters;
+ while (trunc_len) {
+ ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
+ &alloc_size, &xv->xr_list);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (alloc_size > trunc_len)
+ alloc_size = trunc_len;
+
+ ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
+ phys_cpos, alloc_size,
+ &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+ ocfs2_remove_xattr_clusters_from_cache(inode, block,
+ alloc_size);
+ cpos += alloc_size;
+ trunc_len -= alloc_size;
+ }
+
+out:
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+
+ return ret;
+}
+
+static int ocfs2_xattr_value_truncate(struct inode *inode,
+ struct buffer_head *root_bh,
+ struct ocfs2_xattr_value_root *xv,
+ int len)
+{
+ int ret;
+ u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
+ u32 old_clusters = le32_to_cpu(xv->xr_clusters);
+
+ if (new_clusters == old_clusters)
+ return 0;
+
+ if (new_clusters > old_clusters)
+ ret = ocfs2_xattr_extend_allocation(inode,
+ new_clusters - old_clusters,
+ root_bh, xv);
+ else
+ ret = ocfs2_xattr_shrink_size(inode,
+ old_clusters, new_clusters,
+ root_bh, xv);
+
+ return ret;
+}
+
+static int ocfs2_xattr_list_entry(char *buffer, size_t size,
+ size_t *result, const char *prefix,
+ const char *name, int name_len)
+{
+ char *p = buffer + *result;
+ int prefix_len = strlen(prefix);
+ int total_len = prefix_len + name_len + 1;
+
+ *result += total_len;
+
+ /* we are just looking for how big our buffer needs to be */
+ if (!size)
+ return 0;
+
+ if (*result > size)
+ return -ERANGE;
+
+ memcpy(p, prefix, prefix_len);
+ memcpy(p + prefix_len, name, name_len);
+ p[prefix_len + name_len] = '\0';
+
+ return 0;
+}
+
+static int ocfs2_xattr_list_entries(struct inode *inode,
+ struct ocfs2_xattr_header *header,
+ char *buffer, size_t buffer_size)
+{
+ size_t result = 0;
+ int i, type, ret;
+ const char *prefix, *name;
+
+ for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
+ struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
+ type = ocfs2_xattr_get_type(entry);
+ prefix = ocfs2_xattr_prefix(type);
+
+ if (prefix) {
+ name = (const char *)header +
+ le16_to_cpu(entry->xe_name_offset);
+
+ ret = ocfs2_xattr_list_entry(buffer, buffer_size,
+ &result, prefix, name,
+ entry->xe_name_len);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return result;
+}
+
+static int ocfs2_xattr_ibody_list(struct inode *inode,
+ struct ocfs2_dinode *di,
+ char *buffer,
+ size_t buffer_size)
+{
+ struct ocfs2_xattr_header *header = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ int ret = 0;
+
+ if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+ return ret;
+
+ header = (struct ocfs2_xattr_header *)
+ ((void *)di + inode->i_sb->s_blocksize -
+ le16_to_cpu(di->i_xattr_inline_size));
+
+ ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size);
+
+ return ret;
+}
+
+static int ocfs2_xattr_block_list(struct inode *inode,
+ struct ocfs2_dinode *di,
+ char *buffer,
+ size_t buffer_size)
+{
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_xattr_block *xb;
+ int ret = 0;
+
+ if (!di->i_xattr_loc)
+ return ret;
+
+ ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ /*Verify the signature of xattr block*/
+ if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+ strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+ ret = -EFAULT;
+ goto cleanup;
+ }
+
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
+ ret = ocfs2_xattr_list_entries(inode, header,
+ buffer, buffer_size);
+ } else {
+ struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
+ ret = ocfs2_xattr_tree_list_index_block(inode, xt,
+ buffer, buffer_size);
+ }
+cleanup:
+ brelse(blk_bh);
+
+ return ret;
+}
+
+ssize_t ocfs2_listxattr(struct dentry *dentry,
+ char *buffer,
+ size_t size)
+{
+ int ret = 0, i_ret = 0, b_ret = 0;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode);
+
+ if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb)))
+ return -EOPNOTSUPP;
+
+ if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+ return ret;
+
+ ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ down_read(&oi->ip_xattr_sem);
+ i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size);
+ if (i_ret < 0)
+ b_ret = 0;
+ else {
+ if (buffer) {
+ buffer += i_ret;
+ size -= i_ret;
+ }
+ b_ret = ocfs2_xattr_block_list(dentry->d_inode, di,
+ buffer, size);
+ if (b_ret < 0)
+ i_ret = 0;
+ }
+ up_read(&oi->ip_xattr_sem);
+ ocfs2_inode_unlock(dentry->d_inode, 0);
+
+ brelse(di_bh);
+
+ return i_ret + b_ret;
+}
+
+static int ocfs2_xattr_find_entry(int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_xattr_entry *entry;
+ size_t name_len;
+ int i, cmp = 1;
+
+ if (name == NULL)
+ return -EINVAL;
+
+ name_len = strlen(name);
+ entry = xs->here;
+ for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) {
+ cmp = name_index - ocfs2_xattr_get_type(entry);
+ if (!cmp)
+ cmp = name_len - entry->xe_name_len;
+ if (!cmp)
+ cmp = memcmp(name, (xs->base +
+ le16_to_cpu(entry->xe_name_offset)),
+ name_len);
+ if (cmp == 0)
+ break;
+ entry += 1;
+ }
+ xs->here = entry;
+
+ return cmp ? -ENODATA : 0;
+}
+
+static int ocfs2_xattr_get_value_outside(struct inode *inode,
+ struct ocfs2_xattr_value_root *xv,
+ void *buffer,
+ size_t len)
+{
+ u32 cpos, p_cluster, num_clusters, bpc, clusters;
+ u64 blkno;
+ int i, ret = 0;
+ size_t cplen, blocksize;
+ struct buffer_head *bh = NULL;
+ struct ocfs2_extent_list *el;
+
+ el = &xv->xr_list;
+ clusters = le32_to_cpu(xv->xr_clusters);
+ bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ blocksize = inode->i_sb->s_blocksize;
+
+ cpos = 0;
+ while (cpos < clusters) {
+ ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, el);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+ /* Copy ocfs2_xattr_value */
+ for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+ ret = ocfs2_read_block(inode, blkno, &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cplen = len >= blocksize ? blocksize : len;
+ memcpy(buffer, bh->b_data, cplen);
+ len -= cplen;
+ buffer += cplen;
+
+ brelse(bh);
+ bh = NULL;
+ if (len == 0)
+ break;
+ }
+ cpos += num_clusters;
+ }
+out:
+ return ret;
+}
+
+static int ocfs2_xattr_ibody_get(struct inode *inode,
+ int name_index,
+ const char *name,
+ void *buffer,
+ size_t buffer_size,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ struct ocfs2_xattr_value_root *xv;
+ size_t size;
+ int ret = 0;
+
+ if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL))
+ return -ENODATA;
+
+ xs->end = (void *)di + inode->i_sb->s_blocksize;
+ xs->header = (struct ocfs2_xattr_header *)
+ (xs->end - le16_to_cpu(di->i_xattr_inline_size));
+ xs->base = (void *)xs->header;
+ xs->here = xs->header->xh_entries;
+
+ ret = ocfs2_xattr_find_entry(name_index, name, xs);
+ if (ret)
+ return ret;
+ size = le64_to_cpu(xs->here->xe_value_size);
+ if (buffer) {
+ if (size > buffer_size)
+ return -ERANGE;
+ if (ocfs2_xattr_is_local(xs->here)) {
+ memcpy(buffer, (void *)xs->base +
+ le16_to_cpu(xs->here->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xs->here->xe_name_len), size);
+ } else {
+ xv = (struct ocfs2_xattr_value_root *)
+ (xs->base + le16_to_cpu(
+ xs->here->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xs->here->xe_name_len));
+ ret = ocfs2_xattr_get_value_outside(inode, xv,
+ buffer, size);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ }
+ }
+
+ return size;
+}
+
+static int ocfs2_xattr_block_get(struct inode *inode,
+ int name_index,
+ const char *name,
+ void *buffer,
+ size_t buffer_size,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_xattr_block *xb;
+ struct ocfs2_xattr_value_root *xv;
+ size_t size;
+ int ret = -ENODATA, name_offset, name_len, block_off, i;
+
+ if (!di->i_xattr_loc)
+ return ret;
+
+ memset(&xs->bucket, 0, sizeof(xs->bucket));
+
+ ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ /*Verify the signature of xattr block*/
+ if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+ strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+ ret = -EFAULT;
+ goto cleanup;
+ }
+
+ xs->xattr_bh = blk_bh;
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ xs->header = &xb->xb_attrs.xb_header;
+ xs->base = (void *)xs->header;
+ xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+ xs->here = xs->header->xh_entries;
+
+ ret = ocfs2_xattr_find_entry(name_index, name, xs);
+ } else
+ ret = ocfs2_xattr_index_block_find(inode, blk_bh,
+ name_index,
+ name, xs);
+
+ if (ret)
+ goto cleanup;
+ size = le64_to_cpu(xs->here->xe_value_size);
+ if (buffer) {
+ ret = -ERANGE;
+ if (size > buffer_size)
+ goto cleanup;
+
+ name_offset = le16_to_cpu(xs->here->xe_name_offset);
+ name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len);
+ i = xs->here - xs->header->xh_entries;
+
+ if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+ ret = ocfs2_xattr_bucket_get_name_value(inode,
+ xs->bucket.xh,
+ i,
+ &block_off,
+ &name_offset);
+ xs->base = xs->bucket.bhs[block_off]->b_data;
+ }
+ if (ocfs2_xattr_is_local(xs->here)) {
+ memcpy(buffer, (void *)xs->base +
+ name_offset + name_len, size);
+ } else {
+ xv = (struct ocfs2_xattr_value_root *)
+ (xs->base + name_offset + name_len);
+ ret = ocfs2_xattr_get_value_outside(inode, xv,
+ buffer, size);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+ }
+ }
+ ret = size;
+cleanup:
+ for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
+ brelse(xs->bucket.bhs[i]);
+ memset(&xs->bucket, 0, sizeof(xs->bucket));
+
+ brelse(blk_bh);
+ return ret;
+}
+
+/* ocfs2_xattr_get()
+ *
+ * Copy an extended attribute into the buffer provided.
+ * Buffer is NULL to compute the size of buffer required.
+ */
+int ocfs2_xattr_get(struct inode *inode,
+ int name_index,
+ const char *name,
+ void *buffer,
+ size_t buffer_size)
+{
+ int ret;
+ struct ocfs2_dinode *di = NULL;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_xattr_search xis = {
+ .not_found = -ENODATA,
+ };
+ struct ocfs2_xattr_search xbs = {
+ .not_found = -ENODATA,
+ };
+
+ if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+ ret = -ENODATA;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ xis.inode_bh = xbs.inode_bh = di_bh;
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ down_read(&oi->ip_xattr_sem);
+ ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer,
+ buffer_size, &xis);
+ if (ret == -ENODATA)
+ ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
+ buffer_size, &xbs);
+ up_read(&oi->ip_xattr_sem);
+ ocfs2_inode_unlock(inode, 0);
+
+ brelse(di_bh);
+
+ return ret;
+}
+
+static int __ocfs2_xattr_set_value_outside(struct inode *inode,
+ struct ocfs2_xattr_value_root *xv,
+ const void *value,
+ int value_len)
+{
+ int ret = 0, i, cp_len, credits;
+ u16 blocksize = inode->i_sb->s_blocksize;
+ u32 p_cluster, num_clusters;
+ u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
+ u64 blkno;
+ struct buffer_head *bh = NULL;
+ handle_t *handle;
+
+ BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
+
+ credits = clusters * bpc;
+ handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ while (cpos < clusters) {
+ ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
+ &num_clusters, &xv->xr_list);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+
+ for (i = 0; i < num_clusters * bpc; i++, blkno++) {
+ ret = ocfs2_read_block(inode, blkno, &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_journal_access(handle,
+ inode,
+ bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ cp_len = value_len > blocksize ? blocksize : value_len;
+ memcpy(bh->b_data, value, cp_len);
+ value_len -= cp_len;
+ value += cp_len;
+ if (cp_len < blocksize)
+ memset(bh->b_data + cp_len, 0,
+ blocksize - cp_len);
+
+ ret = ocfs2_journal_dirty(handle, bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ brelse(bh);
+ bh = NULL;
+
+ /*
+ * XXX: do we need to empty all the following
+ * blocks in this cluster?
+ */
+ if (!value_len)
+ break;
+ }
+ cpos += num_clusters;
+ }
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+ brelse(bh);
+
+ return ret;
+}
+
+static int ocfs2_xattr_cleanup(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ size_t offs)
+{
+ handle_t *handle = NULL;
+ int ret = 0;
+ size_t name_len = strlen(xi->name);
+ void *val = xs->base + offs;
+ size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+
+ handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+ OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ /* Decrease xattr count */
+ le16_add_cpu(&xs->header->xh_count, -1);
+ /* Remove the xattr entry and tree root which has already be set*/
+ memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
+ memset(val, 0, size);
+
+ ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+ return ret;
+}
+
+static int ocfs2_xattr_update_entry(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ size_t offs)
+{
+ handle_t *handle = NULL;
+ int ret = 0;
+
+ handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+ OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ xs->here->xe_name_offset = cpu_to_le16(offs);
+ xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+ if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE)
+ ocfs2_xattr_set_local(xs->here, 1);
+ else
+ ocfs2_xattr_set_local(xs->here, 0);
+ ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
+
+ ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_set_value_outside()
+ *
+ * Set large size value in B tree.
+ */
+static int ocfs2_xattr_set_value_outside(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ size_t offs)
+{
+ size_t name_len = strlen(xi->name);
+ void *val = xs->base + offs;
+ struct ocfs2_xattr_value_root *xv = NULL;
+ size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+ int ret = 0;
+
+ memset(val, 0, size);
+ memcpy(val, xi->name, name_len);
+ xv = (struct ocfs2_xattr_value_root *)
+ (val + OCFS2_XATTR_SIZE(name_len));
+ xv->xr_clusters = 0;
+ xv->xr_last_eb_blk = 0;
+ xv->xr_list.l_tree_depth = 0;
+ xv->xr_list.l_count = cpu_to_le16(1);
+ xv->xr_list.l_next_free_rec = 0;
+
+ ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
+ xi->value_len);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
+ xi->value_len);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ ret = ocfs2_xattr_update_entry(inode, xi, xs, offs);
+ if (ret < 0)
+ mlog_errno(ret);
+
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_set_entry_local()
+ *
+ * Set, replace or remove extended attribute in local.
+ */
+static void ocfs2_xattr_set_entry_local(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_entry *last,
+ size_t min_offs)
+{
+ size_t name_len = strlen(xi->name);
+ int i;
+
+ if (xi->value && xs->not_found) {
+ /* Insert the new xattr entry. */
+ le16_add_cpu(&xs->header->xh_count, 1);
+ ocfs2_xattr_set_type(last, xi->name_index);
+ ocfs2_xattr_set_local(last, 1);
+ last->xe_name_len = name_len;
+ } else {
+ void *first_val;
+ void *val;
+ size_t offs, size;
+
+ first_val = xs->base + min_offs;
+ offs = le16_to_cpu(xs->here->xe_name_offset);
+ val = xs->base + offs;
+
+ if (le64_to_cpu(xs->here->xe_value_size) >
+ OCFS2_XATTR_INLINE_SIZE)
+ size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_ROOT_SIZE;
+ else
+ size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+
+ if (xi->value && size == OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(xi->value_len)) {
+ /* The old and the new value have the
+ same size. Just replace the value. */
+ ocfs2_xattr_set_local(xs->here, 1);
+ xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+ /* Clear value bytes. */
+ memset(val + OCFS2_XATTR_SIZE(name_len),
+ 0,
+ OCFS2_XATTR_SIZE(xi->value_len));
+ memcpy(val + OCFS2_XATTR_SIZE(name_len),
+ xi->value,
+ xi->value_len);
+ return;
+ }
+ /* Remove the old name+value. */
+ memmove(first_val + size, first_val, val - first_val);
+ memset(first_val, 0, size);
+ xs->here->xe_name_hash = 0;
+ xs->here->xe_name_offset = 0;
+ ocfs2_xattr_set_local(xs->here, 1);
+ xs->here->xe_value_size = 0;
+
+ min_offs += size;
+
+ /* Adjust all value offsets. */
+ last = xs->header->xh_entries;
+ for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
+ size_t o = le16_to_cpu(last->xe_name_offset);
+
+ if (o < offs)
+ last->xe_name_offset = cpu_to_le16(o + size);
+ last += 1;
+ }
+
+ if (!xi->value) {
+ /* Remove the old entry. */
+ last -= 1;
+ memmove(xs->here, xs->here + 1,
+ (void *)last - (void *)xs->here);
+ memset(last, 0, sizeof(struct ocfs2_xattr_entry));
+ le16_add_cpu(&xs->header->xh_count, -1);
+ }
+ }
+ if (xi->value) {
+ /* Insert the new name+value. */
+ size_t size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(xi->value_len);
+ void *val = xs->base + min_offs - size;
+
+ xs->here->xe_name_offset = cpu_to_le16(min_offs - size);
+ memset(val, 0, size);
+ memcpy(val, xi->name, name_len);
+ memcpy(val + OCFS2_XATTR_SIZE(name_len),
+ xi->value,
+ xi->value_len);
+ xs->here->xe_value_size = cpu_to_le64(xi->value_len);
+ ocfs2_xattr_set_local(xs->here, 1);
+ ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
+ }
+
+ return;
+}
+
+/*
+ * ocfs2_xattr_set_entry()
+ *
+ * Set extended attribute entry into inode or block.
+ *
+ * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE,
+ * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(),
+ * then set value in B tree with set_value_outside().
+ */
+static int ocfs2_xattr_set_entry(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ int flag)
+{
+ struct ocfs2_xattr_entry *last;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
+ size_t size_l = 0;
+ handle_t *handle = NULL;
+ int free, i, ret;
+ struct ocfs2_xattr_info xi_l = {
+ .name_index = xi->name_index,
+ .name = xi->name,
+ .value = xi->value,
+ .value_len = xi->value_len,
+ };
+
+ /* Compute min_offs, last and free space. */
+ last = xs->header->xh_entries;
+
+ for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) {
+ size_t offs = le16_to_cpu(last->xe_name_offset);
+ if (offs < min_offs)
+ min_offs = offs;
+ last += 1;
+ }
+
+ free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+ if (free < 0)
+ return -EFAULT;
+
+ if (!xs->not_found) {
+ size_t size = 0;
+ if (ocfs2_xattr_is_local(xs->here))
+ size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+ else
+ size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_ROOT_SIZE;
+ free += (size + sizeof(struct ocfs2_xattr_entry));
+ }
+ /* Check free space in inode or block */
+ if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+ if (free < sizeof(struct ocfs2_xattr_entry) +
+ OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_ROOT_SIZE) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
+ xi_l.value = (void *)&def_xv;
+ xi_l.value_len = OCFS2_XATTR_ROOT_SIZE;
+ } else if (xi->value) {
+ if (free < sizeof(struct ocfs2_xattr_entry) +
+ OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(xi->value_len)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ }
+
+ if (!xs->not_found) {
+ /* For existing extended attribute */
+ size_t size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size));
+ size_t offs = le16_to_cpu(xs->here->xe_name_offset);
+ void *val = xs->base + offs;
+
+ if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
+ /* Replace existing local xattr with tree root */
+ ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
+ offs);
+ if (ret < 0)
+ mlog_errno(ret);
+ goto out;
+ } else if (!ocfs2_xattr_is_local(xs->here)) {
+ /* For existing xattr which has value outside */
+ struct ocfs2_xattr_value_root *xv = NULL;
+ xv = (struct ocfs2_xattr_value_root *)(val +
+ OCFS2_XATTR_SIZE(name_len));
+
+ if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+ /*
+ * If new value need set outside also,
+ * first truncate old value to new value,
+ * then set new value with set_value_outside().
+ */
+ ret = ocfs2_xattr_value_truncate(inode,
+ xs->xattr_bh,
+ xv,
+ xi->value_len);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = __ocfs2_xattr_set_value_outside(inode,
+ xv,
+ xi->value,
+ xi->value_len);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xattr_update_entry(inode,
+ xi,
+ xs,
+ offs);
+ if (ret < 0)
+ mlog_errno(ret);
+ goto out;
+ } else {
+ /*
+ * If new value need set in local,
+ * just trucate old value to zero.
+ */
+ ret = ocfs2_xattr_value_truncate(inode,
+ xs->xattr_bh,
+ xv,
+ 0);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+ }
+ }
+
+ handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+ /* set extended attribute in external block. */
+ ret = ocfs2_extend_trans(handle,
+ OCFS2_INODE_UPDATE_CREDITS +
+ OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ }
+
+ /*
+ * Set value in local, include set tree root in local.
+ * This is the first step for value size >INLINE_SIZE.
+ */
+ ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs);
+
+ if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+ ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ }
+
+ if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) &&
+ (flag & OCFS2_INLINE_XATTR_FL)) {
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ unsigned int xattrsize = osb->s_xattr_inline_size;
+
+ /*
+ * Adjust extent record count or inline data size
+ * to reserve space for extended attribute.
+ */
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ struct ocfs2_inline_data *idata = &di->id2.i_data;
+ le16_add_cpu(&idata->id_count, -xattrsize);
+ } else if (!(ocfs2_inode_is_fast_symlink(inode))) {
+ struct ocfs2_extent_list *el = &di->id2.i_list;
+ le16_add_cpu(&el->l_count, -(xattrsize /
+ sizeof(struct ocfs2_extent_rec)));
+ }
+ di->i_xattr_inline_size = cpu_to_le16(xattrsize);
+ }
+ /* Update xattr flag */
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features |= flag;
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ spin_unlock(&oi->ip_lock);
+ /* Update inode ctime */
+ inode->i_ctime = CURRENT_TIME;
+ di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+ ret = ocfs2_journal_dirty(handle, xs->inode_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+
+ if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+ /*
+ * Set value outside in B tree.
+ * This is the second step for value size > INLINE_SIZE.
+ */
+ size_t offs = le16_to_cpu(xs->here->xe_name_offset);
+ ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
+ if (ret < 0) {
+ int ret2;
+
+ mlog_errno(ret);
+ /*
+ * If set value outside failed, we have to clean
+ * the junk tree root we have already set in local.
+ */
+ ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs);
+ if (ret2 < 0)
+ mlog_errno(ret2);
+ }
+ }
+out:
+ return ret;
+
+}
+
+static int ocfs2_remove_value_outside(struct inode*inode,
+ struct buffer_head *bh,
+ struct ocfs2_xattr_header *header)
+{
+ int ret = 0, i;
+
+ for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
+ struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
+
+ if (!ocfs2_xattr_is_local(entry)) {
+ struct ocfs2_xattr_value_root *xv;
+ void *val;
+
+ val = (void *)header +
+ le16_to_cpu(entry->xe_name_offset);
+ xv = (struct ocfs2_xattr_value_root *)
+ (val + OCFS2_XATTR_SIZE(entry->xe_name_len));
+ ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_xattr_ibody_remove(struct inode *inode,
+ struct buffer_head *di_bh)
+{
+
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_xattr_header *header;
+ int ret;
+
+ header = (struct ocfs2_xattr_header *)
+ ((void *)di + inode->i_sb->s_blocksize -
+ le16_to_cpu(di->i_xattr_inline_size));
+
+ ret = ocfs2_remove_value_outside(inode, di_bh, header);
+
+ return ret;
+}
+
+static int ocfs2_xattr_block_remove(struct inode *inode,
+ struct buffer_head *blk_bh)
+{
+ struct ocfs2_xattr_block *xb;
+ int ret = 0;
+
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
+ ret = ocfs2_remove_value_outside(inode, blk_bh, header);
+ } else
+ ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
+
+ return ret;
+}
+
+static int ocfs2_xattr_free_block(struct inode *inode,
+ u64 block)
+{
+ struct inode *xb_alloc_inode;
+ struct buffer_head *xb_alloc_bh = NULL;
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_xattr_block *xb;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ handle_t *handle;
+ int ret = 0;
+ u64 blk, bg_blkno;
+ u16 bit;
+
+ ret = ocfs2_read_block(inode, block, &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*Verify the signature of xattr block*/
+ if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+ strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = ocfs2_xattr_block_remove(inode, blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+ blk = le64_to_cpu(xb->xb_blkno);
+ bit = le16_to_cpu(xb->xb_suballoc_bit);
+ bg_blkno = ocfs2_which_suballoc_group(blk, bit);
+
+ xb_alloc_inode = ocfs2_get_system_file_inode(osb,
+ EXTENT_ALLOC_SYSTEM_INODE,
+ le16_to_cpu(xb->xb_suballoc_slot));
+ if (!xb_alloc_inode) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+ mutex_lock(&xb_alloc_inode->i_mutex);
+
+ ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh,
+ bit, bg_blkno, 1);
+ if (ret < 0)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(osb, handle);
+out_unlock:
+ ocfs2_inode_unlock(xb_alloc_inode, 1);
+ brelse(xb_alloc_bh);
+out_mutex:
+ mutex_unlock(&xb_alloc_inode->i_mutex);
+ iput(xb_alloc_inode);
+out:
+ brelse(blk_bh);
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_remove()
+ *
+ * Free extended attribute resources associated with this inode.
+ */
+int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ handle_t *handle;
+ int ret;
+
+ if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+ return 0;
+
+ if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
+ return 0;
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+ ret = ocfs2_xattr_ibody_remove(inode, di_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (di->i_xattr_loc) {
+ ret = ocfs2_xattr_free_block(inode,
+ le64_to_cpu(di->i_xattr_loc));
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
+ OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_journal_access(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ di->i_xattr_loc = 0;
+
+ spin_lock(&oi->ip_lock);
+ oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
+ di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
+ spin_unlock(&oi->ip_lock);
+
+ ret = ocfs2_journal_dirty(handle, di_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+ return ret;
+}
+
+static int ocfs2_xattr_has_space_inline(struct inode *inode,
+ struct ocfs2_dinode *di)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size;
+ int free;
+
+ if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE)
+ return 0;
+
+ if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ struct ocfs2_inline_data *idata = &di->id2.i_data;
+ free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size);
+ } else if (ocfs2_inode_is_fast_symlink(inode)) {
+ free = ocfs2_fast_symlink_chars(inode->i_sb) -
+ le64_to_cpu(di->i_size);
+ } else {
+ struct ocfs2_extent_list *el = &di->id2.i_list;
+ free = (le16_to_cpu(el->l_count) -
+ le16_to_cpu(el->l_next_free_rec)) *
+ sizeof(struct ocfs2_extent_rec);
+ }
+ if (free >= xattrsize)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * ocfs2_xattr_ibody_find()
+ *
+ * Find extended attribute in inode block and
+ * fill search info into struct ocfs2_xattr_search.
+ */
+static int ocfs2_xattr_ibody_find(struct inode *inode,
+ int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ int ret;
+ int has_space = 0;
+
+ if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+ return 0;
+
+ if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+ down_read(&oi->ip_alloc_sem);
+ has_space = ocfs2_xattr_has_space_inline(inode, di);
+ up_read(&oi->ip_alloc_sem);
+ if (!has_space)
+ return 0;
+ }
+
+ xs->xattr_bh = xs->inode_bh;
+ xs->end = (void *)di + inode->i_sb->s_blocksize;
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)
+ xs->header = (struct ocfs2_xattr_header *)
+ (xs->end - le16_to_cpu(di->i_xattr_inline_size));
+ else
+ xs->header = (struct ocfs2_xattr_header *)
+ (xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size);
+ xs->base = (void *)xs->header;
+ xs->here = xs->header->xh_entries;
+
+ /* Find the named attribute. */
+ if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) {
+ ret = ocfs2_xattr_find_entry(name_index, name, xs);
+ if (ret && ret != -ENODATA)
+ return ret;
+ xs->not_found = ret;
+ }
+
+ return 0;
+}
+
+/*
+ * ocfs2_xattr_ibody_set()
+ *
+ * Set, replace or remove an extended attribute into inode block.
+ *
+ */
+static int ocfs2_xattr_ibody_set(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ int ret;
+
+ if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE)
+ return -ENOSPC;
+
+ down_write(&oi->ip_alloc_sem);
+ if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) {
+ if (!ocfs2_xattr_has_space_inline(inode, di)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+ }
+
+ ret = ocfs2_xattr_set_entry(inode, xi, xs,
+ (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
+out:
+ up_write(&oi->ip_alloc_sem);
+
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_block_find()
+ *
+ * Find extended attribute in external block and
+ * fill search info into struct ocfs2_xattr_search.
+ */
+static int ocfs2_xattr_block_find(struct inode *inode,
+ int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ struct buffer_head *blk_bh = NULL;
+ struct ocfs2_xattr_block *xb;
+ int ret = 0;
+
+ if (!di->i_xattr_loc)
+ return ret;
+
+ ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ /*Verify the signature of xattr block*/
+ if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE,
+ strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) {
+ ret = -EFAULT;
+ goto cleanup;
+ }
+
+ xs->xattr_bh = blk_bh;
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
+
+ if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ xs->header = &xb->xb_attrs.xb_header;
+ xs->base = (void *)xs->header;
+ xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size;
+ xs->here = xs->header->xh_entries;
+
+ ret = ocfs2_xattr_find_entry(name_index, name, xs);
+ } else
+ ret = ocfs2_xattr_index_block_find(inode, blk_bh,
+ name_index,
+ name, xs);
+
+ if (ret && ret != -ENODATA) {
+ xs->xattr_bh = NULL;
+ goto cleanup;
+ }
+ xs->not_found = ret;
+ return 0;
+cleanup:
+ brelse(blk_bh);
+
+ return ret;
+}
+
+/*
+ * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree
+ * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header
+ * re-initialized.
+ */
+static int ocfs2_restore_xattr_block(struct inode *inode,
+ struct ocfs2_xattr_search *xs)
+{
+ int ret;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+ struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
+ u16 xb_flags = le16_to_cpu(xb->xb_flags);
+
+ BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) ||
+ le16_to_cpu(el->l_next_free_rec) != 0);
+
+ handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
+ offsetof(struct ocfs2_xattr_block, xb_attrs));
+
+ xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED);
+
+ ocfs2_journal_dirty(handle, xs->xattr_bh);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_block_set()
+ *
+ * Set, replace or remove an extended attribute into external block.
+ *
+ */
+static int ocfs2_xattr_block_set(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs)
+{
+ struct buffer_head *new_bh = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle = NULL;
+ struct ocfs2_xattr_block *xblk = NULL;
+ u16 suballoc_bit_start;
+ u32 num_got;
+ u64 first_blkno;
+ int ret;
+
+ if (!xs->xattr_bh) {
+ /*
+ * Alloc one external block for extended attribute
+ * outside of inode.
+ */
+ ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ handle = ocfs2_start_trans(osb,
+ OCFS2_XATTR_BLOCK_CREATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+ &suballoc_bit_start, &num_got,
+ &first_blkno);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ new_bh = sb_getblk(inode->i_sb, first_blkno);
+ ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+ ret = ocfs2_journal_access(handle, inode, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /* Initialize ocfs2_xattr_block */
+ xs->xattr_bh = new_bh;
+ xblk = (struct ocfs2_xattr_block *)new_bh->b_data;
+ memset(xblk, 0, inode->i_sb->s_blocksize);
+ strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE);
+ xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num);
+ xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+ xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation);
+ xblk->xb_blkno = cpu_to_le64(first_blkno);
+
+ xs->header = &xblk->xb_attrs.xb_header;
+ xs->base = (void *)xs->header;
+ xs->end = (void *)xblk + inode->i_sb->s_blocksize;
+ xs->here = xs->header->xh_entries;
+
+
+ ret = ocfs2_journal_dirty(handle, new_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ di->i_xattr_loc = cpu_to_le64(first_blkno);
+ ret = ocfs2_journal_dirty(handle, xs->inode_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ if (ret < 0)
+ return ret;
+ } else
+ xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
+
+ if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
+ /* Set extended attribute into external block */
+ ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
+ if (!ret || ret != -ENOSPC)
+ goto end;
+
+ ret = ocfs2_xattr_create_index_block(inode, xs);
+ if (ret)
+ goto end;
+ }
+
+ ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
+ if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0)
+ ret = ocfs2_restore_xattr_block(inode, xs);
+
+end:
+
+ return ret;
+}
+
+/*
+ * ocfs2_xattr_set()
+ *
+ * Set, replace or remove an extended attribute for this inode.
+ * value is NULL to remove an existing extended attribute, else either
+ * create or replace an extended attribute.
+ */
+int ocfs2_xattr_set(struct inode *inode,
+ int name_index,
+ const char *name,
+ const void *value,
+ size_t value_len,
+ int flags)
+{
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di;
+ int ret;
+ u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ struct ocfs2_xattr_info xi = {
+ .name_index = name_index,
+ .name = name,
+ .value = value,
+ .value_len = value_len,
+ };
+
+ struct ocfs2_xattr_search xis = {
+ .not_found = -ENODATA,
+ };
+
+ struct ocfs2_xattr_search xbs = {
+ .not_found = -ENODATA,
+ };
+
+ if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+ return -EOPNOTSUPP;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ xis.inode_bh = xbs.inode_bh = di_bh;
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ down_write(&OCFS2_I(inode)->ip_xattr_sem);
+ /*
+ * Scan inode and external block to find the same name
+ * extended attribute and collect search infomation.
+ */
+ ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
+ if (ret)
+ goto cleanup;
+ if (xis.not_found) {
+ ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs);
+ if (ret)
+ goto cleanup;
+ }
+
+ if (xis.not_found && xbs.not_found) {
+ ret = -ENODATA;
+ if (flags & XATTR_REPLACE)
+ goto cleanup;
+ ret = 0;
+ if (!value)
+ goto cleanup;
+ } else {
+ ret = -EEXIST;
+ if (flags & XATTR_CREATE)
+ goto cleanup;
+ }
+
+ if (!value) {
+ /* Remove existing extended attribute */
+ if (!xis.not_found)
+ ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+ else if (!xbs.not_found)
+ ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+ } else {
+ /* We always try to set extended attribute into inode first*/
+ ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+ if (!ret && !xbs.not_found) {
+ /*
+ * If succeed and that extended attribute existing in
+ * external block, then we will remove it.
+ */
+ xi.value = NULL;
+ xi.value_len = 0;
+ ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+ } else if (ret == -ENOSPC) {
+ if (di->i_xattr_loc && !xbs.xattr_bh) {
+ ret = ocfs2_xattr_block_find(inode, name_index,
+ name, &xbs);
+ if (ret)
+ goto cleanup;
+ }
+ /*
+ * If no space in inode, we will set extended attribute
+ * into external block.
+ */
+ ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
+ if (ret)
+ goto cleanup;
+ if (!xis.not_found) {
+ /*
+ * If succeed and that extended attribute
+ * existing in inode, we will remove it.
+ */
+ xi.value = NULL;
+ xi.value_len = 0;
+ ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
+ }
+ }
+ }
+cleanup:
+ up_write(&OCFS2_I(inode)->ip_xattr_sem);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ brelse(xbs.xattr_bh);
+ for (i = 0; i < blk_per_bucket; i++)
+ brelse(xbs.bucket.bhs[i]);
+
+ return ret;
+}
+
+/*
+ * Find the xattr extent rec which may contains name_hash.
+ * e_cpos will be the first name hash of the xattr rec.
+ * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list.
+ */
+static int ocfs2_xattr_get_rec(struct inode *inode,
+ u32 name_hash,
+ u64 *p_blkno,
+ u32 *e_cpos,
+ u32 *num_clusters,
+ struct ocfs2_extent_list *el)
+{
+ int ret = 0, i;
+ struct buffer_head *eb_bh = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_rec *rec = NULL;
+ u64 e_blkno = 0;
+
+ if (el->l_tree_depth) {
+ ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+
+ if (el->l_tree_depth) {
+ ocfs2_error(inode->i_sb,
+ "Inode %lu has non zero tree depth in "
+ "xattr tree block %llu\n", inode->i_ino,
+ (unsigned long long)eb_bh->b_blocknr);
+ ret = -EROFS;
+ goto out;
+ }
+ }
+
+ for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
+ rec = &el->l_recs[i];
+
+ if (le32_to_cpu(rec->e_cpos) <= name_hash) {
+ e_blkno = le64_to_cpu(rec->e_blkno);
+ break;
+ }
+ }
+
+ if (!e_blkno) {
+ ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+ "record (%u, %u, 0) in xattr", inode->i_ino,
+ le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
+ ret = -EROFS;
+ goto out;
+ }
+
+ *p_blkno = le64_to_cpu(rec->e_blkno);
+ *num_clusters = le16_to_cpu(rec->e_leaf_clusters);
+ if (e_cpos)
+ *e_cpos = le32_to_cpu(rec->e_cpos);
+out:
+ brelse(eb_bh);
+ return ret;
+}
+
+typedef int (xattr_bucket_func)(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para);
+
+static int ocfs2_find_xe_in_bucket(struct inode *inode,
+ struct buffer_head *header_bh,
+ int name_index,
+ const char *name,
+ u32 name_hash,
+ u16 *xe_index,
+ int *found)
+{
+ int i, ret = 0, cmp = 1, block_off, new_offset;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)header_bh->b_data;
+ size_t name_len = strlen(name);
+ struct ocfs2_xattr_entry *xe = NULL;
+ struct buffer_head *name_bh = NULL;
+ char *xe_name;
+
+ /*
+ * We don't use binary search in the bucket because there
+ * may be multiple entries with the same name hash.
+ */
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+
+ if (name_hash > le32_to_cpu(xe->xe_name_hash))
+ continue;
+ else if (name_hash < le32_to_cpu(xe->xe_name_hash))
+ break;
+
+ cmp = name_index - ocfs2_xattr_get_type(xe);
+ if (!cmp)
+ cmp = name_len - xe->xe_name_len;
+ if (cmp)
+ continue;
+
+ ret = ocfs2_xattr_bucket_get_name_value(inode,
+ xh,
+ i,
+ &block_off,
+ &new_offset);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
+ &name_bh);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ xe_name = name_bh->b_data + new_offset;
+
+ cmp = memcmp(name, xe_name, name_len);
+ brelse(name_bh);
+ name_bh = NULL;
+
+ if (cmp == 0) {
+ *xe_index = i;
+ *found = 1;
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Find the specified xattr entry in a series of buckets.
+ * This series start from p_blkno and last for num_clusters.
+ * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains
+ * the num of the valid buckets.
+ *
+ * Return the buffer_head this xattr should reside in. And if the xattr's
+ * hash is in the gap of 2 buckets, return the lower bucket.
+ */
+static int ocfs2_xattr_bucket_find(struct inode *inode,
+ int name_index,
+ const char *name,
+ u32 name_hash,
+ u64 p_blkno,
+ u32 first_hash,
+ u32 num_clusters,
+ struct ocfs2_xattr_search *xs)
+{
+ int ret, found = 0;
+ struct buffer_head *bh = NULL;
+ struct buffer_head *lower_bh = NULL;
+ struct ocfs2_xattr_header *xh = NULL;
+ struct ocfs2_xattr_entry *xe = NULL;
+ u16 index = 0;
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ int low_bucket = 0, bucket, high_bucket;
+ u32 last_hash;
+ u64 blkno;
+
+ ret = ocfs2_read_block(inode, p_blkno, &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xh = (struct ocfs2_xattr_header *)bh->b_data;
+ high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
+
+ while (low_bucket <= high_bucket) {
+ brelse(bh);
+ bh = NULL;
+ bucket = (low_bucket + high_bucket) / 2;
+
+ blkno = p_blkno + bucket * blk_per_bucket;
+
+ ret = ocfs2_read_block(inode, blkno, &bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xh = (struct ocfs2_xattr_header *)bh->b_data;
+ xe = &xh->xh_entries[0];
+ if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
+ high_bucket = bucket - 1;
+ continue;
+ }
+
+ /*
+ * Check whether the hash of the last entry in our
+ * bucket is larger than the search one. for an empty
+ * bucket, the last one is also the first one.
+ */
+ if (xh->xh_count)
+ xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1];
+
+ last_hash = le32_to_cpu(xe->xe_name_hash);
+
+ /* record lower_bh which may be the insert place. */
+ brelse(lower_bh);
+ lower_bh = bh;
+ bh = NULL;
+
+ if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
+ low_bucket = bucket + 1;
+ continue;
+ }
+
+ /* the searched xattr should reside in this bucket if exists. */
+ ret = ocfs2_find_xe_in_bucket(inode, lower_bh,
+ name_index, name, name_hash,
+ &index, &found);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ break;
+ }
+
+ /*
+ * Record the bucket we have found.
+ * When the xattr's hash value is in the gap of 2 buckets, we will
+ * always set it to the previous bucket.
+ */
+ if (!lower_bh) {
+ /*
+ * We can't find any bucket whose first name_hash is less
+ * than the find name_hash.
+ */
+ BUG_ON(bh->b_blocknr != p_blkno);
+ lower_bh = bh;
+ bh = NULL;
+ }
+ xs->bucket.bhs[0] = lower_bh;
+ xs->bucket.xh = (struct ocfs2_xattr_header *)
+ xs->bucket.bhs[0]->b_data;
+ lower_bh = NULL;
+
+ xs->header = xs->bucket.xh;
+ xs->base = xs->bucket.bhs[0]->b_data;
+ xs->end = xs->base + inode->i_sb->s_blocksize;
+
+ if (found) {
+ /*
+ * If we have found the xattr enty, read all the blocks in
+ * this bucket.
+ */
+ ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
+ blk_per_bucket - 1, &xs->bucket.bhs[1],
+ OCFS2_BH_CACHED);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xs->here = &xs->header->xh_entries[index];
+ mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
+ (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index);
+ } else
+ ret = -ENODATA;
+
+out:
+ brelse(bh);
+ brelse(lower_bh);
+ return ret;
+}
+
+static int ocfs2_xattr_index_block_find(struct inode *inode,
+ struct buffer_head *root_bh,
+ int name_index,
+ const char *name,
+ struct ocfs2_xattr_search *xs)
+{
+ int ret;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)root_bh->b_data;
+ struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+ struct ocfs2_extent_list *el = &xb_root->xt_list;
+ u64 p_blkno = 0;
+ u32 first_hash, num_clusters = 0;
+ u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
+
+ if (le16_to_cpu(el->l_next_free_rec) == 0)
+ return -ENODATA;
+
+ mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n",
+ name, name_hash, name_index);
+
+ ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
+ &num_clusters, el);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
+
+ mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
+ "in the rec is %u\n", num_clusters, p_blkno, first_hash);
+
+ ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
+ p_blkno, first_hash, num_clusters, xs);
+
+out:
+ return ret;
+}
+
+static int ocfs2_iterate_xattr_buckets(struct inode *inode,
+ u64 blkno,
+ u32 clusters,
+ xattr_bucket_func *func,
+ void *para)
+{
+ int i, j, ret = 0;
+ int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+ u32 num_buckets = clusters * bpc;
+ struct ocfs2_xattr_bucket bucket;
+
+ memset(&bucket, 0, sizeof(bucket));
+
+ mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
+ clusters, blkno);
+
+ for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
+ ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
+ bucket.bhs, OCFS2_BH_CACHED);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
+ /*
+ * The real bucket num in this series of blocks is stored
+ * in the 1st bucket.
+ */
+ if (i == 0)
+ num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
+
+ mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno,
+ le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
+ if (func) {
+ ret = func(inode, &bucket, para);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ for (j = 0; j < blk_per_bucket; j++)
+ brelse(bucket.bhs[j]);
+ memset(&bucket, 0, sizeof(bucket));
+ }
+
+out:
+ for (j = 0; j < blk_per_bucket; j++)
+ brelse(bucket.bhs[j]);
+
+ return ret;
+}
+
+struct ocfs2_xattr_tree_list {
+ char *buffer;
+ size_t buffer_size;
+ size_t result;
+};
+
+static int ocfs2_xattr_bucket_get_name_value(struct inode *inode,
+ struct ocfs2_xattr_header *xh,
+ int index,
+ int *block_off,
+ int *new_offset)
+{
+ u16 name_offset;
+
+ if (index < 0 || index >= le16_to_cpu(xh->xh_count))
+ return -EINVAL;
+
+ name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset);
+
+ *block_off = name_offset >> inode->i_sb->s_blocksize_bits;
+ *new_offset = name_offset % inode->i_sb->s_blocksize;
+
+ return 0;
+}
+
+static int ocfs2_list_xattr_bucket(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para)
+{
+ int ret = 0, type;
+ struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
+ int i, block_off, new_offset;
+ const char *prefix, *name;
+
+ for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
+ struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
+ type = ocfs2_xattr_get_type(entry);
+ prefix = ocfs2_xattr_prefix(type);
+
+ if (prefix) {
+ ret = ocfs2_xattr_bucket_get_name_value(inode,
+ bucket->xh,
+ i,
+ &block_off,
+ &new_offset);
+ if (ret)
+ break;
+
+ name = (const char *)bucket->bhs[block_off]->b_data +
+ new_offset;
+ ret = ocfs2_xattr_list_entry(xl->buffer,
+ xl->buffer_size,
+ &xl->result,
+ prefix, name,
+ entry->xe_name_len);
+ if (ret)
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
+ struct ocfs2_xattr_tree_root *xt,
+ char *buffer,
+ size_t buffer_size)
+{
+ struct ocfs2_extent_list *el = &xt->xt_list;
+ int ret = 0;
+ u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0;
+ u64 p_blkno = 0;
+ struct ocfs2_xattr_tree_list xl = {
+ .buffer = buffer,
+ .buffer_size = buffer_size,
+ .result = 0,
+ };
+
+ if (le16_to_cpu(el->l_next_free_rec) == 0)
+ return 0;
+
+ while (name_hash > 0) {
+ ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
+ &e_cpos, &num_clusters, el);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
+ ocfs2_list_xattr_bucket,
+ &xl);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (e_cpos == 0)
+ break;
+
+ name_hash = e_cpos - 1;
+ }
+
+ ret = xl.result;
+out:
+ return ret;
+}
+
+static int cmp_xe(const void *a, const void *b)
+{
+ const struct ocfs2_xattr_entry *l = a, *r = b;
+ u32 l_hash = le32_to_cpu(l->xe_name_hash);
+ u32 r_hash = le32_to_cpu(r->xe_name_hash);
+
+ if (l_hash > r_hash)
+ return 1;
+ if (l_hash < r_hash)
+ return -1;
+ return 0;
+}
+
+static void swap_xe(void *a, void *b, int size)
+{
+ struct ocfs2_xattr_entry *l = a, *r = b, tmp;
+
+ tmp = *l;
+ memcpy(l, r, sizeof(struct ocfs2_xattr_entry));
+ memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry));
+}
+
+/*
+ * When the ocfs2_xattr_block is filled up, new bucket will be created
+ * and all the xattr entries will be moved to the new bucket.
+ * Note: we need to sort the entries since they are not saved in order
+ * in the ocfs2_xattr_block.
+ */
+static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
+ struct buffer_head *xb_bh,
+ struct buffer_head *xh_bh,
+ struct buffer_head *data_bh)
+{
+ int i, blocksize = inode->i_sb->s_blocksize;
+ u16 offset, size, off_change;
+ struct ocfs2_xattr_entry *xe;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)xb_bh->b_data;
+ struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)xh_bh->b_data;
+ u16 count = le16_to_cpu(xb_xh->xh_count);
+ char *target = xh_bh->b_data, *src = xb_bh->b_data;
+
+ mlog(0, "cp xattr from block %llu to bucket %llu\n",
+ (unsigned long long)xb_bh->b_blocknr,
+ (unsigned long long)xh_bh->b_blocknr);
+
+ memset(xh_bh->b_data, 0, blocksize);
+ if (data_bh)
+ memset(data_bh->b_data, 0, blocksize);
+ /*
+ * Since the xe_name_offset is based on ocfs2_xattr_header,
+ * there is a offset change corresponding to the change of
+ * ocfs2_xattr_header's position.
+ */
+ off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+ xe = &xb_xh->xh_entries[count - 1];
+ offset = le16_to_cpu(xe->xe_name_offset) + off_change;
+ size = blocksize - offset;
+
+ /* copy all the names and values. */
+ if (data_bh)
+ target = data_bh->b_data;
+ memcpy(target + offset, src + offset, size);
+
+ /* Init new header now. */
+ xh->xh_count = xb_xh->xh_count;
+ xh->xh_num_buckets = cpu_to_le16(1);
+ xh->xh_name_value_len = cpu_to_le16(size);
+ xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
+
+ /* copy all the entries. */
+ target = xh_bh->b_data;
+ offset = offsetof(struct ocfs2_xattr_header, xh_entries);
+ size = count * sizeof(struct ocfs2_xattr_entry);
+ memcpy(target + offset, (char *)xb_xh + offset, size);
+
+ /* Change the xe offset for all the xe because of the move. */
+ off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize +
+ offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header);
+ for (i = 0; i < count; i++)
+ le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
+
+ mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n",
+ offset, size, off_change);
+
+ sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
+ cmp_xe, swap_xe);
+}
+
+/*
+ * After we move xattr from block to index btree, we have to
+ * update ocfs2_xattr_search to the new xe and base.
+ *
+ * When the entry is in xattr block, xattr_bh indicates the storage place.
+ * While if the entry is in index b-tree, "bucket" indicates the
+ * real place of the xattr.
+ */
+static int ocfs2_xattr_update_xattr_search(struct inode *inode,
+ struct ocfs2_xattr_search *xs,
+ struct buffer_head *old_bh,
+ struct buffer_head *new_bh)
+{
+ int ret = 0;
+ char *buf = old_bh->b_data;
+ struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
+ struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
+ int i, blocksize = inode->i_sb->s_blocksize;
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ xs->bucket.bhs[0] = new_bh;
+ get_bh(new_bh);
+ xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
+ xs->header = xs->bucket.xh;
+
+ xs->base = new_bh->b_data;
+ xs->end = xs->base + inode->i_sb->s_blocksize;
+
+ if (!xs->not_found) {
+ if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
+ ret = ocfs2_read_blocks(inode,
+ xs->bucket.bhs[0]->b_blocknr + 1,
+ blk_per_bucket - 1, &xs->bucket.bhs[1],
+ OCFS2_BH_CACHED);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ i = xs->here - old_xh->xh_entries;
+ xs->here = &xs->header->xh_entries[i];
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_xattr_create_index_block(struct inode *inode,
+ struct ocfs2_xattr_search *xs)
+{
+ int ret, credits = OCFS2_SUBALLOC_ALLOC;
+ u32 bit_off, len;
+ u64 blkno;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_alloc_context *data_ac;
+ struct buffer_head *xh_bh = NULL, *data_bh = NULL;
+ struct buffer_head *xb_bh = xs->xattr_bh;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)xb_bh->b_data;
+ struct ocfs2_xattr_tree_root *xr;
+ u16 xb_flags = le16_to_cpu(xb->xb_flags);
+ u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ mlog(0, "create xattr index block for %llu\n",
+ (unsigned long long)xb_bh->b_blocknr);
+
+ BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
+
+ ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * XXX:
+ * We can use this lock for now, and maybe move to a dedicated mutex
+ * if performance becomes a problem later.
+ */
+ down_write(&oi->ip_alloc_sem);
+
+ /*
+ * 3 more credits, one for xattr block update, one for the 1st block
+ * of the new xattr bucket and one for the value/data.
+ */
+ credits += 3;
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_sem;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, xb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /*
+ * The bucket may spread in many blocks, and
+ * we will only touch the 1st block and the last block
+ * in the whole bucket(one for entry and one for data).
+ */
+ blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
+
+ mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno);
+
+ xh_bh = sb_getblk(inode->i_sb, blkno);
+ if (!xh_bh) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ocfs2_set_new_buffer_uptodate(inode, xh_bh);
+
+ ret = ocfs2_journal_access(handle, inode, xh_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ if (bpb > 1) {
+ data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
+ if (!data_bh) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ocfs2_set_new_buffer_uptodate(inode, data_bh);
+
+ ret = ocfs2_journal_access(handle, inode, data_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+ }
+
+ ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
+
+ ocfs2_journal_dirty(handle, xh_bh);
+ if (data_bh)
+ ocfs2_journal_dirty(handle, data_bh);
+
+ ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
+
+ /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
+ memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
+ offsetof(struct ocfs2_xattr_block, xb_attrs));
+
+ xr = &xb->xb_attrs.xb_root;
+ xr->xt_clusters = cpu_to_le32(1);
+ xr->xt_last_eb_blk = 0;
+ xr->xt_list.l_tree_depth = 0;
+ xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb));
+ xr->xt_list.l_next_free_rec = cpu_to_le16(1);
+
+ xr->xt_list.l_recs[0].e_cpos = 0;
+ xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
+ xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
+
+ xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
+
+ ret = ocfs2_journal_dirty(handle, xb_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out_sem:
+ up_write(&oi->ip_alloc_sem);
+
+out:
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+
+ brelse(xh_bh);
+ brelse(data_bh);
+
+ return ret;
+}
+
+static int cmp_xe_offset(const void *a, const void *b)
+{
+ const struct ocfs2_xattr_entry *l = a, *r = b;
+ u32 l_name_offset = le16_to_cpu(l->xe_name_offset);
+ u32 r_name_offset = le16_to_cpu(r->xe_name_offset);
+
+ if (l_name_offset < r_name_offset)
+ return 1;
+ if (l_name_offset > r_name_offset)
+ return -1;
+ return 0;
+}
+
+/*
+ * defrag a xattr bucket if we find that the bucket has some
+ * holes beteen name/value pairs.
+ * We will move all the name/value pairs to the end of the bucket
+ * so that we can spare some space for insertion.
+ */
+static int ocfs2_defrag_xattr_bucket(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket)
+{
+ int ret, i;
+ size_t end, offset, len, value_len;
+ struct ocfs2_xattr_header *xh;
+ char *entries, *buf, *bucket_buf = NULL;
+ u64 blkno = bucket->bhs[0]->b_blocknr;
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ u16 xh_free_start;
+ size_t blocksize = inode->i_sb->s_blocksize;
+ handle_t *handle;
+ struct buffer_head **bhs;
+ struct ocfs2_xattr_entry *xe;
+
+ bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+ GFP_NOFS);
+ if (!bhs)
+ return -ENOMEM;
+
+ ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs,
+ OCFS2_BH_CACHED);
+ if (ret)
+ goto out;
+
+ /*
+ * In order to make the operation more efficient and generic,
+ * we copy all the blocks into a contiguous memory and do the
+ * defragment there, so if anything is error, we will not touch
+ * the real block.
+ */
+ bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS);
+ if (!bucket_buf) {
+ ret = -EIO;
+ goto out;
+ }
+
+ buf = bucket_buf;
+ for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
+ memcpy(buf, bhs[i]->b_data, blocksize);
+
+ handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = 0; i < blk_per_bucket; i++) {
+ ret = ocfs2_journal_access(handle, inode, bhs[i],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto commit;
+ }
+ }
+
+ xh = (struct ocfs2_xattr_header *)bucket_buf;
+ entries = (char *)xh->xh_entries;
+ xh_free_start = le16_to_cpu(xh->xh_free_start);
+
+ mlog(0, "adjust xattr bucket in %llu, count = %u, "
+ "xh_free_start = %u, xh_name_value_len = %u.\n",
+ blkno, le16_to_cpu(xh->xh_count), xh_free_start,
+ le16_to_cpu(xh->xh_name_value_len));
+
+ /*
+ * sort all the entries by their offset.
+ * the largest will be the first, so that we can
+ * move them to the end one by one.
+ */
+ sort(entries, le16_to_cpu(xh->xh_count),
+ sizeof(struct ocfs2_xattr_entry),
+ cmp_xe_offset, swap_xe);
+
+ /* Move all name/values to the end of the bucket. */
+ xe = xh->xh_entries;
+ end = OCFS2_XATTR_BUCKET_SIZE;
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) {
+ offset = le16_to_cpu(xe->xe_name_offset);
+ if (ocfs2_xattr_is_local(xe))
+ value_len = OCFS2_XATTR_SIZE(
+ le64_to_cpu(xe->xe_value_size));
+ else
+ value_len = OCFS2_XATTR_ROOT_SIZE;
+ len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len;
+
+ /*
+ * We must make sure that the name/value pair
+ * exist in the same block. So adjust end to
+ * the previous block end if needed.
+ */
+ if (((end - len) / blocksize !=
+ (end - 1) / blocksize))
+ end = end - end % blocksize;
+
+ if (end > offset + len) {
+ memmove(bucket_buf + end - len,
+ bucket_buf + offset, len);
+ xe->xe_name_offset = cpu_to_le16(end - len);
+ }
+
+ mlog_bug_on_msg(end < offset + len, "Defrag check failed for "
+ "bucket %llu\n", (unsigned long long)blkno);
+
+ end -= len;
+ }
+
+ mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for "
+ "bucket %llu\n", (unsigned long long)blkno);
+
+ if (xh_free_start == end)
+ goto commit;
+
+ memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
+ xh->xh_free_start = cpu_to_le16(end);
+
+ /* sort the entries by their name_hash. */
+ sort(entries, le16_to_cpu(xh->xh_count),
+ sizeof(struct ocfs2_xattr_entry),
+ cmp_xe, swap_xe);
+
+ buf = bucket_buf;
+ for (i = 0; i < blk_per_bucket; i++, buf += blocksize) {
+ memcpy(bhs[i]->b_data, buf, blocksize);
+ ocfs2_journal_dirty(handle, bhs[i]);
+ }
+
+commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+out:
+
+ if (bhs) {
+ for (i = 0; i < blk_per_bucket; i++)
+ brelse(bhs[i]);
+ }
+ kfree(bhs);
+
+ kfree(bucket_buf);
+ return ret;
+}
+
+/*
+ * Move half nums of the xattr bucket in the previous cluster to this new
+ * cluster. We only touch the last cluster of the previous extend record.
+ *
+ * first_bh is the first buffer_head of a series of bucket in the same
+ * extent rec and header_bh is the header of one bucket in this cluster.
+ * They will be updated if we move the data header_bh contains to the new
+ * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
+ */
+static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
+ handle_t *handle,
+ struct buffer_head **first_bh,
+ struct buffer_head **header_bh,
+ u64 new_blkno,
+ u64 prev_blkno,
+ u32 num_clusters,
+ u32 *first_hash)
+{
+ int i, ret, credits;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+ int blocksize = inode->i_sb->s_blocksize;
+ struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
+ struct ocfs2_xattr_header *new_xh;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)((*first_bh)->b_data);
+
+ BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
+ BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
+
+ prev_bh = *first_bh;
+ get_bh(prev_bh);
+ xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
+
+ prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
+
+ mlog(0, "move half of xattrs in cluster %llu to %llu\n",
+ prev_blkno, new_blkno);
+
+ /*
+ * We need to update the 1st half of the new cluster and
+ * 1 more for the update of the 1st bucket of the previous
+ * extent record.
+ */
+ credits = bpc / 2 + 1;
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, prev_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) {
+ old_bh = new_bh = NULL;
+ new_bh = sb_getblk(inode->i_sb, new_blkno);
+ if (!new_bh) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_set_new_buffer_uptodate(inode, new_bh);
+
+ ret = ocfs2_journal_access(handle, inode, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ brelse(new_bh);
+ goto out;
+ }
+
+ ret = ocfs2_read_block(inode, prev_blkno, &old_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ brelse(new_bh);
+ goto out;
+ }
+
+ memcpy(new_bh->b_data, old_bh->b_data, blocksize);
+
+ if (i == 0) {
+ new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
+ new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
+
+ if (first_hash)
+ *first_hash = le32_to_cpu(
+ new_xh->xh_entries[0].xe_name_hash);
+ new_first_bh = new_bh;
+ get_bh(new_first_bh);
+ }
+
+ ocfs2_journal_dirty(handle, new_bh);
+
+ if (*header_bh == old_bh) {
+ brelse(*header_bh);
+ *header_bh = new_bh;
+ get_bh(*header_bh);
+
+ brelse(*first_bh);
+ *first_bh = new_first_bh;
+ get_bh(*first_bh);
+ }
+ brelse(new_bh);
+ brelse(old_bh);
+ }
+
+ le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
+
+ ocfs2_journal_dirty(handle, prev_bh);
+out:
+ brelse(prev_bh);
+ brelse(new_first_bh);
+ return ret;
+}
+
+static int ocfs2_read_xattr_bucket(struct inode *inode,
+ u64 blkno,
+ struct buffer_head **bhs,
+ int new)
+{
+ int ret = 0;
+ u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ if (!new)
+ return ocfs2_read_blocks(inode, blkno,
+ blk_per_bucket, bhs,
+ OCFS2_BH_CACHED);
+
+ for (i = 0; i < blk_per_bucket; i++) {
+ bhs[i] = sb_getblk(inode->i_sb, blkno + i);
+ if (bhs[i] == NULL) {
+ ret = -EIO;
+ mlog_errno(ret);
+ break;
+ }
+ ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+ }
+
+ return ret;
+}
+
+/*
+ * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk).
+ * first_hash will record the 1st hash of the new bucket.
+ */
+static int ocfs2_half_xattr_bucket(struct inode *inode,
+ handle_t *handle,
+ u64 blk,
+ u64 new_blk,
+ u32 *first_hash,
+ int new_bucket_head)
+{
+ int ret, i;
+ u16 count, start, len, name_value_len, xe_len, name_offset;
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ struct buffer_head **s_bhs, **t_bhs = NULL;
+ struct ocfs2_xattr_header *xh;
+ struct ocfs2_xattr_entry *xe;
+ int blocksize = inode->i_sb->s_blocksize;
+
+ mlog(0, "move half of xattrs from bucket %llu to %llu\n",
+ blk, new_blk);
+
+ s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+ if (!s_bhs)
+ return -ENOMEM;
+
+ ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, s_bhs[0],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
+ if (!t_bhs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = 0; i < blk_per_bucket; i++) {
+ ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ /* copy the whole bucket to the new first. */
+ for (i = 0; i < blk_per_bucket; i++)
+ memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+
+ /* update the new bucket. */
+ xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+ count = le16_to_cpu(xh->xh_count);
+ start = count / 2;
+
+ /*
+ * Calculate the total name/value len and xh_free_start for
+ * the old bucket first.
+ */
+ name_offset = OCFS2_XATTR_BUCKET_SIZE;
+ name_value_len = 0;
+ for (i = 0; i < start; i++) {
+ xe = &xh->xh_entries[i];
+ xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+ if (ocfs2_xattr_is_local(xe))
+ xe_len +=
+ OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+ else
+ xe_len += OCFS2_XATTR_ROOT_SIZE;
+ name_value_len += xe_len;
+ if (le16_to_cpu(xe->xe_name_offset) < name_offset)
+ name_offset = le16_to_cpu(xe->xe_name_offset);
+ }
+
+ /*
+ * Now begin the modification to the new bucket.
+ *
+ * In the new bucket, We just move the xattr entry to the beginning
+ * and don't touch the name/value. So there will be some holes in the
+ * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is
+ * called.
+ */
+ xe = &xh->xh_entries[start];
+ len = sizeof(struct ocfs2_xattr_entry) * (count - start);
+ mlog(0, "mv xattr entry len %d from %d to %d\n", len,
+ (int)((char *)xe - (char *)xh),
+ (int)((char *)xh->xh_entries - (char *)xh));
+ memmove((char *)xh->xh_entries, (char *)xe, len);
+ xe = &xh->xh_entries[count - start];
+ len = sizeof(struct ocfs2_xattr_entry) * start;
+ memset((char *)xe, 0, len);
+
+ le16_add_cpu(&xh->xh_count, -start);
+ le16_add_cpu(&xh->xh_name_value_len, -name_value_len);
+
+ /* Calculate xh_free_start for the new bucket. */
+ xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+ xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+ if (ocfs2_xattr_is_local(xe))
+ xe_len +=
+ OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+ else
+ xe_len += OCFS2_XATTR_ROOT_SIZE;
+ if (le16_to_cpu(xe->xe_name_offset) <
+ le16_to_cpu(xh->xh_free_start))
+ xh->xh_free_start = xe->xe_name_offset;
+ }
+
+ /* set xh->xh_num_buckets for the new xh. */
+ if (new_bucket_head)
+ xh->xh_num_buckets = cpu_to_le16(1);
+ else
+ xh->xh_num_buckets = 0;
+
+ for (i = 0; i < blk_per_bucket; i++) {
+ ocfs2_journal_dirty(handle, t_bhs[i]);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+ /* store the first_hash of the new bucket. */
+ if (first_hash)
+ *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+
+ /*
+ * Now only update the 1st block of the old bucket.
+ * Please note that the entry has been sorted already above.
+ */
+ xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+ memset(&xh->xh_entries[start], 0,
+ sizeof(struct ocfs2_xattr_entry) * (count - start));
+ xh->xh_count = cpu_to_le16(start);
+ xh->xh_free_start = cpu_to_le16(name_offset);
+ xh->xh_name_value_len = cpu_to_le16(name_value_len);
+
+ ocfs2_journal_dirty(handle, s_bhs[0]);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ if (s_bhs) {
+ for (i = 0; i < blk_per_bucket; i++)
+ brelse(s_bhs[i]);
+ }
+ kfree(s_bhs);
+
+ if (t_bhs) {
+ for (i = 0; i < blk_per_bucket; i++)
+ brelse(t_bhs[i]);
+ }
+ kfree(t_bhs);
+
+ return ret;
+}
+
+/*
+ * Copy xattr from one bucket to another bucket.
+ *
+ * The caller must make sure that the journal transaction
+ * has enough space for journaling.
+ */
+static int ocfs2_cp_xattr_bucket(struct inode *inode,
+ handle_t *handle,
+ u64 s_blkno,
+ u64 t_blkno,
+ int t_is_new)
+{
+ int ret, i;
+ int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ int blocksize = inode->i_sb->s_blocksize;
+ struct buffer_head **s_bhs, **t_bhs = NULL;
+
+ BUG_ON(s_blkno == t_blkno);
+
+ mlog(0, "cp bucket %llu to %llu, target is %d\n",
+ s_blkno, t_blkno, t_is_new);
+
+ s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+ GFP_NOFS);
+ if (!s_bhs)
+ return -ENOMEM;
+
+ ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
+ if (ret)
+ goto out;
+
+ t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
+ GFP_NOFS);
+ if (!t_bhs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < blk_per_bucket; i++) {
+ ret = ocfs2_journal_access(handle, inode, t_bhs[i],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret)
+ goto out;
+ }
+
+ for (i = 0; i < blk_per_bucket; i++) {
+ memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+ ocfs2_journal_dirty(handle, t_bhs[i]);
+ }
+
+out:
+ if (s_bhs) {
+ for (i = 0; i < blk_per_bucket; i++)
+ brelse(s_bhs[i]);
+ }
+ kfree(s_bhs);
+
+ if (t_bhs) {
+ for (i = 0; i < blk_per_bucket; i++)
+ brelse(t_bhs[i]);
+ }
+ kfree(t_bhs);
+
+ return ret;
+}
+
+/*
+ * Copy one xattr cluster from src_blk to to_blk.
+ * The to_blk will become the first bucket header of the cluster, so its
+ * xh_num_buckets will be initialized as the bucket num in the cluster.
+ */
+static int ocfs2_cp_xattr_cluster(struct inode *inode,
+ handle_t *handle,
+ struct buffer_head *first_bh,
+ u64 src_blk,
+ u64 to_blk,
+ u32 *first_hash)
+{
+ int i, ret, credits;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
+ struct buffer_head *bh = NULL;
+ struct ocfs2_xattr_header *xh;
+ u64 to_blk_start = to_blk;
+
+ mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk);
+
+ /*
+ * We need to update the new cluster and 1 more for the update of
+ * the 1st bucket of the previous extent rec.
+ */
+ credits = bpc + 1;
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, first_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = 0; i < num_buckets; i++) {
+ ret = ocfs2_cp_xattr_bucket(inode, handle,
+ src_blk, to_blk, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ }
+
+ /* update the old bucket header. */
+ xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+ le16_add_cpu(&xh->xh_num_buckets, -num_buckets);
+
+ ocfs2_journal_dirty(handle, first_bh);
+
+ /* update the new bucket header. */
+ ret = ocfs2_read_block(inode, to_blk_start, &bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xh = (struct ocfs2_xattr_header *)bh->b_data;
+ xh->xh_num_buckets = cpu_to_le16(num_buckets);
+
+ ocfs2_journal_dirty(handle, bh);
+
+ if (first_hash)
+ *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+out:
+ brelse(bh);
+ return ret;
+}
+
+/*
+ * Move half of the xattrs in this cluster to the new cluster.
+ * This function should only be called when bucket size == cluster size.
+ * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead.
+ */
+static int ocfs2_half_xattr_cluster(struct inode *inode,
+ handle_t *handle,
+ u64 prev_blk,
+ u64 new_blk,
+ u32 *first_hash)
+{
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ int ret, credits = 2 * blk_per_bucket;
+
+ BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
+
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /* Move half of the xattr in start_blk to the next bucket. */
+ return ocfs2_half_xattr_bucket(inode, handle, prev_blk,
+ new_blk, first_hash, 1);
+}
+
+/*
+ * Move some xattrs from the old cluster to the new one since they are not
+ * contiguous in ocfs2 xattr tree.
+ *
+ * new_blk starts a new separate cluster, and we will move some xattrs from
+ * prev_blk to it. v_start will be set as the first name hash value in this
+ * new cluster so that it can be used as e_cpos during tree insertion and
+ * don't collide with our original b-tree operations. first_bh and header_bh
+ * will also be updated since they will be used in ocfs2_extend_xattr_bucket
+ * to extend the insert bucket.
+ *
+ * The problem is how much xattr should we move to the new one and when should
+ * we update first_bh and header_bh?
+ * 1. If cluster size > bucket size, that means the previous cluster has more
+ * than 1 bucket, so just move half nums of bucket into the new cluster and
+ * update the first_bh and header_bh if the insert bucket has been moved
+ * to the new cluster.
+ * 2. If cluster_size == bucket_size:
+ * a) If the previous extent rec has more than one cluster and the insert
+ * place isn't in the last cluster, copy the entire last cluster to the
+ * new one. This time, we don't need to upate the first_bh and header_bh
+ * since they will not be moved into the new cluster.
+ * b) Otherwise, move the bottom half of the xattrs in the last cluster into
+ * the new one. And we set the extend flag to zero if the insert place is
+ * moved into the new allocated cluster since no extend is needed.
+ */
+static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
+ handle_t *handle,
+ struct buffer_head **first_bh,
+ struct buffer_head **header_bh,
+ u64 new_blk,
+ u64 prev_blk,
+ u32 prev_clusters,
+ u32 *v_start,
+ int *extend)
+{
+ int ret = 0;
+ int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+
+ mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
+ prev_blk, prev_clusters, new_blk);
+
+ if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
+ ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
+ handle,
+ first_bh,
+ header_bh,
+ new_blk,
+ prev_blk,
+ prev_clusters,
+ v_start);
+ else {
+ u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
+
+ if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
+ ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
+ last_blk, new_blk,
+ v_start);
+ else {
+ ret = ocfs2_half_xattr_cluster(inode, handle,
+ last_blk, new_blk,
+ v_start);
+
+ if ((*header_bh)->b_blocknr == last_blk && extend)
+ *extend = 0;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Add a new cluster for xattr storage.
+ *
+ * If the new cluster is contiguous with the previous one, it will be
+ * appended to the same extent record, and num_clusters will be updated.
+ * If not, we will insert a new extent for it and move some xattrs in
+ * the last cluster into the new allocated one.
+ * We also need to limit the maximum size of a btree leaf, otherwise we'll
+ * lose the benefits of hashing because we'll have to search large leaves.
+ * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize,
+ * if it's bigger).
+ *
+ * first_bh is the first block of the previous extent rec and header_bh
+ * indicates the bucket we will insert the new xattrs. They will be updated
+ * when the header_bh is moved into the new cluster.
+ */
+static int ocfs2_add_new_xattr_cluster(struct inode *inode,
+ struct buffer_head *root_bh,
+ struct buffer_head **first_bh,
+ struct buffer_head **header_bh,
+ u32 *num_clusters,
+ u32 prev_cpos,
+ u64 prev_blkno,
+ int *extend)
+{
+ int ret, credits;
+ u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ u32 prev_clusters = *num_clusters;
+ u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
+ u64 block;
+ handle_t *handle = NULL;
+ struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_extent_tree et;
+
+ mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
+ "previous xattr blkno = %llu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ prev_cpos, prev_blkno);
+
+ ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
+
+ ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
+ &data_ac, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
+ clusters_to_add);
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+ clusters_to_add, &bit_off, &num_bits);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ BUG_ON(num_bits > clusters_to_add);
+
+ block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
+ mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
+ num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ if (prev_blkno + prev_clusters * bpc == block &&
+ (prev_clusters + num_bits) << osb->s_clustersize_bits <=
+ OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
+ /*
+ * If this cluster is contiguous with the old one and
+ * adding this new cluster, we don't surpass the limit of
+ * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be
+ * initialized and used like other buckets in the previous
+ * cluster.
+ * So add it as a contiguous one. The caller will handle
+ * its init process.
+ */
+ v_start = prev_cpos + prev_clusters;
+ *num_clusters = prev_clusters + num_bits;
+ mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
+ num_bits);
+ } else {
+ ret = ocfs2_adjust_xattr_cross_cluster(inode,
+ handle,
+ first_bh,
+ header_bh,
+ block,
+ prev_blkno,
+ prev_clusters,
+ &v_start,
+ extend);
+ if (ret) {
+ mlog_errno(ret);
+ goto leave;
+ }
+ }
+
+ if (handle->h_buffer_credits < credits) {
+ /*
+ * The journal has been restarted before, and don't
+ * have enough space for the insertion, so extend it
+ * here.
+ */
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto leave;
+ }
+ }
+ mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
+ num_bits, block, v_start);
+ ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
+ num_bits, 0, meta_ac);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+ ret = ocfs2_journal_dirty(handle, root_bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto leave;
+ }
+
+leave:
+ if (handle)
+ ocfs2_commit_trans(osb, handle);
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ return ret;
+}
+
+/*
+ * Extend a new xattr bucket and move xattrs to the end one by one until
+ * We meet with start_bh. Only move half of the xattrs to the bucket after it.
+ */
+static int ocfs2_extend_xattr_bucket(struct inode *inode,
+ struct buffer_head *first_bh,
+ struct buffer_head *start_bh,
+ u32 num_clusters)
+{
+ int ret, credits;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ u64 start_blk = start_bh->b_blocknr, end_blk;
+ u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
+ handle_t *handle;
+ struct ocfs2_xattr_header *first_xh =
+ (struct ocfs2_xattr_header *)first_bh->b_data;
+ u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
+
+ mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
+ "from %llu, len = %u\n", start_blk,
+ (unsigned long long)first_bh->b_blocknr, num_clusters);
+
+ BUG_ON(bucket >= num_buckets);
+
+ end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
+
+ /*
+ * We will touch all the buckets after the start_bh(include it).
+ * Add one more bucket and modify the first_bh.
+ */
+ credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, first_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto commit;
+ }
+
+ while (end_blk != start_blk) {
+ ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
+ end_blk + blk_per_bucket, 0);
+ if (ret)
+ goto commit;
+ end_blk -= blk_per_bucket;
+ }
+
+ /* Move half of the xattr in start_blk to the next bucket. */
+ ret = ocfs2_half_xattr_bucket(inode, handle, start_blk,
+ start_blk + blk_per_bucket, NULL, 0);
+
+ le16_add_cpu(&first_xh->xh_num_buckets, 1);
+ ocfs2_journal_dirty(handle, first_bh);
+
+commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ return ret;
+}
+
+/*
+ * Add new xattr bucket in an extent record and adjust the buckets accordingly.
+ * xb_bh is the ocfs2_xattr_block.
+ * We will move all the buckets starting from header_bh to the next place. As
+ * for this one, half num of its xattrs will be moved to the next one.
+ *
+ * We will allocate a new cluster if current cluster is full and adjust
+ * header_bh and first_bh if the insert place is moved to the new cluster.
+ */
+static int ocfs2_add_new_xattr_bucket(struct inode *inode,
+ struct buffer_head *xb_bh,
+ struct buffer_head *header_bh)
+{
+ struct ocfs2_xattr_header *first_xh = NULL;
+ struct buffer_head *first_bh = NULL;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)xb_bh->b_data;
+ struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
+ struct ocfs2_extent_list *el = &xb_root->xt_list;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)header_bh->b_data;
+ u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+ struct super_block *sb = inode->i_sb;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+ int ret, num_buckets, extend = 1;
+ u64 p_blkno;
+ u32 e_cpos, num_clusters;
+
+ mlog(0, "Add new xattr bucket starting form %llu\n",
+ (unsigned long long)header_bh->b_blocknr);
+
+ /*
+ * Add refrence for header_bh here because it may be
+ * changed in ocfs2_add_new_xattr_cluster and we need
+ * to free it in the end.
+ */
+ get_bh(header_bh);
+
+ ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
+ &num_clusters, el);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_block(inode, p_blkno, &first_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
+ first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
+
+ if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) {
+ ret = ocfs2_add_new_xattr_cluster(inode,
+ xb_bh,
+ &first_bh,
+ &header_bh,
+ &num_clusters,
+ e_cpos,
+ p_blkno,
+ &extend);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (extend)
+ ret = ocfs2_extend_xattr_bucket(inode,
+ first_bh,
+ header_bh,
+ num_clusters);
+ if (ret)
+ mlog_errno(ret);
+out:
+ brelse(first_bh);
+ brelse(header_bh);
+ return ret;
+}
+
+static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ int offs)
+{
+ int block_off = offs >> inode->i_sb->s_blocksize_bits;
+
+ offs = offs % inode->i_sb->s_blocksize;
+ return bucket->bhs[block_off]->b_data + offs;
+}
+
+/*
+ * Handle the normal xattr set, including replace, delete and new.
+ *
+ * Note: "local" indicates the real data's locality. So we can't
+ * just its bucket locality by its length.
+ */
+static void ocfs2_xattr_set_entry_normal(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ u32 name_hash,
+ int local)
+{
+ struct ocfs2_xattr_entry *last, *xe;
+ int name_len = strlen(xi->name);
+ struct ocfs2_xattr_header *xh = xs->header;
+ u16 count = le16_to_cpu(xh->xh_count), start;
+ size_t blocksize = inode->i_sb->s_blocksize;
+ char *val;
+ size_t offs, size, new_size;
+
+ last = &xh->xh_entries[count];
+ if (!xs->not_found) {
+ xe = xs->here;
+ offs = le16_to_cpu(xe->xe_name_offset);
+ if (ocfs2_xattr_is_local(xe))
+ size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+ else
+ size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+
+ /*
+ * If the new value will be stored outside, xi->value has been
+ * initalized as an empty ocfs2_xattr_value_root, and the same
+ * goes with xi->value_len, so we can set new_size safely here.
+ * See ocfs2_xattr_set_in_bucket.
+ */
+ new_size = OCFS2_XATTR_SIZE(name_len) +
+ OCFS2_XATTR_SIZE(xi->value_len);
+
+ le16_add_cpu(&xh->xh_name_value_len, -size);
+ if (xi->value) {
+ if (new_size > size)
+ goto set_new_name_value;
+
+ /* Now replace the old value with new one. */
+ if (local)
+ xe->xe_value_size = cpu_to_le64(xi->value_len);
+ else
+ xe->xe_value_size = 0;
+
+ val = ocfs2_xattr_bucket_get_val(inode,
+ &xs->bucket, offs);
+ memset(val + OCFS2_XATTR_SIZE(name_len), 0,
+ size - OCFS2_XATTR_SIZE(name_len));
+ if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
+ memcpy(val + OCFS2_XATTR_SIZE(name_len),
+ xi->value, xi->value_len);
+
+ le16_add_cpu(&xh->xh_name_value_len, new_size);
+ ocfs2_xattr_set_local(xe, local);
+ return;
+ } else {
+ /*
+ * Remove the old entry if there is more than one.
+ * We don't remove the last entry so that we can
+ * use it to indicate the hash value of the empty
+ * bucket.
+ */
+ last -= 1;
+ le16_add_cpu(&xh->xh_count, -1);
+ if (xh->xh_count) {
+ memmove(xe, xe + 1,
+ (void *)last - (void *)xe);
+ memset(last, 0,
+ sizeof(struct ocfs2_xattr_entry));
+ } else
+ xh->xh_free_start =
+ cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE);
+
+ return;
+ }
+ } else {
+ /* find a new entry for insert. */
+ int low = 0, high = count - 1, tmp;
+ struct ocfs2_xattr_entry *tmp_xe;
+
+ while (low <= high && count) {
+ tmp = (low + high) / 2;
+ tmp_xe = &xh->xh_entries[tmp];
+
+ if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash))
+ low = tmp + 1;
+ else if (name_hash <
+ le32_to_cpu(tmp_xe->xe_name_hash))
+ high = tmp - 1;
+ else {
+ low = tmp;
+ break;
+ }
+ }
+
+ xe = &xh->xh_entries[low];
+ if (low != count)
+ memmove(xe + 1, xe, (void *)last - (void *)xe);
+
+ le16_add_cpu(&xh->xh_count, 1);
+ memset(xe, 0, sizeof(struct ocfs2_xattr_entry));
+ xe->xe_name_hash = cpu_to_le32(name_hash);
+ xe->xe_name_len = name_len;
+ ocfs2_xattr_set_type(xe, xi->name_index);
+ }
+
+set_new_name_value:
+ /* Insert the new name+value. */
+ size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len);
+
+ /*
+ * We must make sure that the name/value pair
+ * exists in the same block.
+ */
+ offs = le16_to_cpu(xh->xh_free_start);
+ start = offs - size;
+
+ if (start >> inode->i_sb->s_blocksize_bits !=
+ (offs - 1) >> inode->i_sb->s_blocksize_bits) {
+ offs = offs - offs % blocksize;
+ xh->xh_free_start = cpu_to_le16(offs);
+ }
+
+ val = ocfs2_xattr_bucket_get_val(inode,
+ &xs->bucket, offs - size);
+ xe->xe_name_offset = cpu_to_le16(offs - size);
+
+ memset(val, 0, size);
+ memcpy(val, xi->name, name_len);
+ memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len);
+
+ xe->xe_value_size = cpu_to_le64(xi->value_len);
+ ocfs2_xattr_set_local(xe, local);
+ xs->here = xe;
+ le16_add_cpu(&xh->xh_free_start, -size);
+ le16_add_cpu(&xh->xh_name_value_len, size);
+
+ return;
+}
+
+static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
+ handle_t *handle,
+ struct ocfs2_xattr_search *xs,
+ struct buffer_head **bhs,
+ u16 bh_num)
+{
+ int ret = 0, off, block_off;
+ struct ocfs2_xattr_entry *xe = xs->here;
+
+ /*
+ * First calculate all the blocks we should journal_access
+ * and journal_dirty. The first block should always be touched.
+ */
+ ret = ocfs2_journal_dirty(handle, bhs[0]);
+ if (ret)
+ mlog_errno(ret);
+
+ /* calc the data. */
+ off = le16_to_cpu(xe->xe_name_offset);
+ block_off = off >> inode->i_sb->s_blocksize_bits;
+ ret = ocfs2_journal_dirty(handle, bhs[block_off]);
+ if (ret)
+ mlog_errno(ret);
+
+ return ret;
+}
+
+/*
+ * Set the xattr entry in the specified bucket.
+ * The bucket is indicated by xs->bucket and it should have the enough
+ * space for the xattr insertion.
+ */
+static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs,
+ u32 name_hash,
+ int local)
+{
+ int i, ret;
+ handle_t *handle = NULL;
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
+ (unsigned long)xi->value_len, xi->name_index,
+ (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
+
+ if (!xs->bucket.bhs[1]) {
+ ret = ocfs2_read_blocks(inode,
+ xs->bucket.bhs[0]->b_blocknr + 1,
+ blk_per_bucket - 1, &xs->bucket.bhs[1],
+ OCFS2_BH_CACHED);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ handle = ocfs2_start_trans(osb, blk_per_bucket);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = 0; i < blk_per_bucket; i++) {
+ ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
+
+ /*Only dirty the blocks we have touched in set xattr. */
+ ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
+ xs->bucket.bhs, blk_per_bucket);
+ if (ret)
+ mlog_errno(ret);
+out:
+ ocfs2_commit_trans(osb, handle);
+
+ return ret;
+}
+
+static int ocfs2_xattr_value_update_size(struct inode *inode,
+ struct buffer_head *xe_bh,
+ struct ocfs2_xattr_entry *xe,
+ u64 new_size)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ handle_t *handle = NULL;
+
+ handle = ocfs2_start_trans(osb, 1);
+ if (handle == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, xe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ xe->xe_value_size = cpu_to_le64(new_size);
+
+ ret = ocfs2_journal_dirty(handle, xe_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ return ret;
+}
+
+/*
+ * Truncate the specified xe_off entry in xattr bucket.
+ * bucket is indicated by header_bh and len is the new length.
+ * Both the ocfs2_xattr_value_root and the entry will be updated here.
+ *
+ * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
+ */
+static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
+ struct buffer_head *header_bh,
+ int xe_off,
+ int len)
+{
+ int ret, offset;
+ u64 value_blk;
+ struct buffer_head *value_bh = NULL;
+ struct ocfs2_xattr_value_root *xv;
+ struct ocfs2_xattr_entry *xe;
+ struct ocfs2_xattr_header *xh =
+ (struct ocfs2_xattr_header *)header_bh->b_data;
+ size_t blocksize = inode->i_sb->s_blocksize;
+
+ xe = &xh->xh_entries[xe_off];
+
+ BUG_ON(!xe || ocfs2_xattr_is_local(xe));
+
+ offset = le16_to_cpu(xe->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xe->xe_name_len);
+
+ value_blk = offset / blocksize;
+
+ /* We don't allow ocfs2_xattr_value to be stored in different block. */
+ BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
+ value_blk += header_bh->b_blocknr;
+
+ ret = ocfs2_read_block(inode, value_blk, &value_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xv = (struct ocfs2_xattr_value_root *)
+ (value_bh->b_data + offset % blocksize);
+
+ mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
+ xe_off, (unsigned long long)header_bh->b_blocknr, len);
+ ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ brelse(value_bh);
+ return ret;
+}
+
+static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
+ struct ocfs2_xattr_search *xs,
+ int len)
+{
+ int ret, offset;
+ struct ocfs2_xattr_entry *xe = xs->here;
+ struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
+
+ BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe));
+
+ offset = xe - xh->xh_entries;
+ ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0],
+ offset, len);
+ if (ret)
+ mlog_errno(ret);
+
+ return ret;
+}
+
+static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
+ struct ocfs2_xattr_search *xs,
+ char *val,
+ int value_len)
+{
+ int offset;
+ struct ocfs2_xattr_value_root *xv;
+ struct ocfs2_xattr_entry *xe = xs->here;
+
+ BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
+
+ offset = le16_to_cpu(xe->xe_name_offset) +
+ OCFS2_XATTR_SIZE(xe->xe_name_len);
+
+ xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
+
+ return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len);
+}
+
+static int ocfs2_rm_xattr_cluster(struct inode *inode,
+ struct buffer_head *root_bh,
+ u64 blkno,
+ u32 cpos,
+ u32 len)
+{
+ int ret;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ handle_t *handle;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)root_bh->b_data;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct ocfs2_extent_tree et;
+
+ ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
+ cpos, len, (unsigned long long)blkno);
+
+ ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len);
+
+ ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+ if (handle == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
+ &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len);
+
+ ret = ocfs2_journal_dirty(handle, root_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+
+ mutex_unlock(&tl_inode->i_mutex);
+
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ ocfs2_run_deallocs(osb, &dealloc);
+
+ return ret;
+}
+
+static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
+ struct ocfs2_xattr_search *xs)
+{
+ handle_t *handle = NULL;
+ struct ocfs2_xattr_header *xh = xs->bucket.xh;
+ struct ocfs2_xattr_entry *last = &xh->xh_entries[
+ le16_to_cpu(xh->xh_count) - 1];
+ int ret = 0;
+
+ handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ return;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ /* Remove the old entry. */
+ memmove(xs->here, xs->here + 1,
+ (void *)last - (void *)xs->here);
+ memset(last, 0, sizeof(struct ocfs2_xattr_entry));
+ le16_add_cpu(&xh->xh_count, -1);
+
+ ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]);
+ if (ret < 0)
+ mlog_errno(ret);
+out_commit:
+ ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+}
+
+/*
+ * Set the xattr name/value in the bucket specified in xs.
+ *
+ * As the new value in xi may be stored in the bucket or in an outside cluster,
+ * we divide the whole process into 3 steps:
+ * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket)
+ * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs)
+ * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside)
+ * 4. If the clusters for the new outside value can't be allocated, we need
+ * to free the xattr we allocated in set.
+ */
+static int ocfs2_xattr_set_in_bucket(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs)
+{
+ int ret, local = 1;
+ size_t value_len;
+ char *val = (char *)xi->value;
+ struct ocfs2_xattr_entry *xe = xs->here;
+ u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name,
+ strlen(xi->name));
+
+ if (!xs->not_found && !ocfs2_xattr_is_local(xe)) {
+ /*
+ * We need to truncate the xattr storage first.
+ *
+ * If both the old and new value are stored to
+ * outside block, we only need to truncate
+ * the storage and then set the value outside.
+ *
+ * If the new value should be stored within block,
+ * we should free all the outside block first and
+ * the modification to the xattr block will be done
+ * by following steps.
+ */
+ if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+ value_len = xi->value_len;
+ else
+ value_len = 0;
+
+ ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
+ value_len);
+ if (ret)
+ goto out;
+
+ if (value_len)
+ goto set_value_outside;
+ }
+
+ value_len = xi->value_len;
+ /* So we have to handle the inside block change now. */
+ if (value_len > OCFS2_XATTR_INLINE_SIZE) {
+ /*
+ * If the new value will be stored outside of block,
+ * initalize a new empty value root and insert it first.
+ */
+ local = 0;
+ xi->value = &def_xv;
+ xi->value_len = OCFS2_XATTR_ROOT_SIZE;
+ }
+
+ ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (value_len <= OCFS2_XATTR_INLINE_SIZE)
+ goto out;
+
+ /* allocate the space now for the outside block storage. */
+ ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
+ value_len);
+ if (ret) {
+ mlog_errno(ret);
+
+ if (xs->not_found) {
+ /*
+ * We can't allocate enough clusters for outside
+ * storage and we have allocated xattr already,
+ * so need to remove it.
+ */
+ ocfs2_xattr_bucket_remove_xs(inode, xs);
+ }
+ goto out;
+ }
+
+set_value_outside:
+ ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
+out:
+ return ret;
+}
+
+/* check whether the xattr bucket is filled up with the same hash value. */
+static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket)
+{
+ struct ocfs2_xattr_header *xh = bucket->xh;
+
+ if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash ==
+ xh->xh_entries[0].xe_name_hash) {
+ mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
+ "hash = %u\n",
+ (unsigned long long)bucket->bhs[0]->b_blocknr,
+ le32_to_cpu(xh->xh_entries[0].xe_name_hash));
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
+static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xs)
+{
+ struct ocfs2_xattr_header *xh;
+ struct ocfs2_xattr_entry *xe;
+ u16 count, header_size, xh_free_start;
+ int i, free, max_free, need, old;
+ size_t value_size = 0, name_len = strlen(xi->name);
+ size_t blocksize = inode->i_sb->s_blocksize;
+ int ret, allocation = 0;
+ u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ mlog_entry("Set xattr %s in xattr index block\n", xi->name);
+
+try_again:
+ xh = xs->header;
+ count = le16_to_cpu(xh->xh_count);
+ xh_free_start = le16_to_cpu(xh->xh_free_start);
+ header_size = sizeof(struct ocfs2_xattr_header) +
+ count * sizeof(struct ocfs2_xattr_entry);
+ max_free = OCFS2_XATTR_BUCKET_SIZE -
+ le16_to_cpu(xh->xh_name_value_len) - header_size;
+
+ mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
+ "of %u which exceed block size\n",
+ (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+ header_size);
+
+ if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+ value_size = OCFS2_XATTR_ROOT_SIZE;
+ else if (xi->value)
+ value_size = OCFS2_XATTR_SIZE(xi->value_len);
+
+ if (xs->not_found)
+ need = sizeof(struct ocfs2_xattr_entry) +
+ OCFS2_XATTR_SIZE(name_len) + value_size;
+ else {
+ need = value_size + OCFS2_XATTR_SIZE(name_len);
+
+ /*
+ * We only replace the old value if the new length is smaller
+ * than the old one. Otherwise we will allocate new space in the
+ * bucket to store it.
+ */
+ xe = xs->here;
+ if (ocfs2_xattr_is_local(xe))
+ old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size));
+ else
+ old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE);
+
+ if (old >= value_size)
+ need = 0;
+ }
+
+ free = xh_free_start - header_size;
+ /*
+ * We need to make sure the new name/value pair
+ * can exist in the same block.
+ */
+ if (xh_free_start % blocksize < need)
+ free -= xh_free_start % blocksize;
+
+ mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
+ "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
+ " %u\n", xs->not_found,
+ (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+ free, need, max_free, le16_to_cpu(xh->xh_free_start),
+ le16_to_cpu(xh->xh_name_value_len));
+
+ if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+ if (need <= max_free &&
+ count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+ /*
+ * We can create the space by defragment. Since only the
+ * name/value will be moved, the xe shouldn't be changed
+ * in xs.
+ */
+ ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ xh_free_start = le16_to_cpu(xh->xh_free_start);
+ free = xh_free_start - header_size;
+ if (xh_free_start % blocksize < need)
+ free -= xh_free_start % blocksize;
+
+ if (free >= need)
+ goto xattr_set;
+
+ mlog(0, "Can't get enough space for xattr insert by "
+ "defragment. Need %u bytes, but we have %d, so "
+ "allocate new bucket for it.\n", need, free);
+ }
+
+ /*
+ * We have to add new buckets or clusters and one
+ * allocation should leave us enough space for insert.
+ */
+ BUG_ON(allocation);
+
+ /*
+ * We do not allow for overlapping ranges between buckets. And
+ * the maximum number of collisions we will allow for then is
+ * one bucket's worth, so check it here whether we need to
+ * add a new bucket for the insert.
+ */
+ ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_add_new_xattr_bucket(inode,
+ xs->xattr_bh,
+ xs->bucket.bhs[0]);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for (i = 0; i < blk_per_bucket; i++)
+ brelse(xs->bucket.bhs[i]);
+
+ memset(&xs->bucket, 0, sizeof(xs->bucket));
+
+ ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
+ xi->name_index,
+ xi->name, xs);
+ if (ret && ret != -ENODATA)
+ goto out;
+ xs->not_found = ret;
+ allocation = 1;
+ goto try_again;
+ }
+
+xattr_set:
+ ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
+out:
+ mlog_exit(ret);
+ return ret;
+}
+
+static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
+ struct ocfs2_xattr_bucket *bucket,
+ void *para)
+{
+ int ret = 0;
+ struct ocfs2_xattr_header *xh = bucket->xh;
+ u16 i;
+ struct ocfs2_xattr_entry *xe;
+
+ for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
+ xe = &xh->xh_entries[i];
+ if (ocfs2_xattr_is_local(xe))
+ continue;
+
+ ret = ocfs2_xattr_bucket_value_truncate(inode,
+ bucket->bhs[0],
+ i, 0);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int ocfs2_delete_xattr_index_block(struct inode *inode,
+ struct buffer_head *xb_bh)
+{
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)xb_bh->b_data;
+ struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list;
+ int ret = 0;
+ u32 name_hash = UINT_MAX, e_cpos, num_clusters;
+ u64 p_blkno;
+
+ if (le16_to_cpu(el->l_next_free_rec) == 0)
+ return 0;
+
+ while (name_hash > 0) {
+ ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno,
+ &e_cpos, &num_clusters, el);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters,
+ ocfs2_delete_xattr_in_bucket,
+ NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_rm_xattr_cluster(inode, xb_bh,
+ p_blkno, e_cpos, num_clusters);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+
+ if (e_cpos == 0)
+ break;
+
+ name_hash = e_cpos - 1;
+ }
+
+out:
+ return ret;
+}
+
+/*
+ * 'trusted' attributes support
+ */
+
+#define XATTR_TRUSTED_PREFIX "trusted."
+
+static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
+ size_t list_size, const char *name,
+ size_t name_len)
+{
+ const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1;
+ const size_t total_len = prefix_len + name_len + 1;
+
+ if (list && total_len <= list_size) {
+ memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
+ memcpy(list + prefix_len, name, name_len);
+ list[prefix_len + name_len] = '\0';
+ }
+ return total_len;
+}
+
+static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name,
+ buffer, size);
+}
+
+static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value,
+ size, flags);
+}
+
+struct xattr_handler ocfs2_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = ocfs2_xattr_trusted_list,
+ .get = ocfs2_xattr_trusted_get,
+ .set = ocfs2_xattr_trusted_set,
+};
+
+
+/*
+ * 'user' attributes support
+ */
+
+#define XATTR_USER_PREFIX "user."
+
+static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
+ size_t list_size, const char *name,
+ size_t name_len)
+{
+ const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1;
+ const size_t total_len = prefix_len + name_len + 1;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+ return 0;
+
+ if (list && total_len <= list_size) {
+ memcpy(list, XATTR_USER_PREFIX, prefix_len);
+ memcpy(list + prefix_len, name, name_len);
+ list[prefix_len + name_len] = '\0';
+ }
+ return total_len;
+}
+
+static int ocfs2_xattr_user_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+ return -EOPNOTSUPP;
+ return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
+ buffer, size);
+}
+
+static int ocfs2_xattr_user_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+ return -EOPNOTSUPP;
+
+ return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value,
+ size, flags);
+}
+
+struct xattr_handler ocfs2_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .list = ocfs2_xattr_user_list,
+ .get = ocfs2_xattr_user_get,
+ .set = ocfs2_xattr_user_set,
+};
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
new file mode 100644
index 00000000000..c25c7c62a05
--- /dev/null
+++ b/fs/ocfs2/xattr.h
@@ -0,0 +1,68 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * xattr.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef OCFS2_XATTR_H
+#define OCFS2_XATTR_H
+
+#include <linux/init.h>
+#include <linux/xattr.h>
+
+enum ocfs2_xattr_type {
+ OCFS2_XATTR_INDEX_USER = 1,
+ OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS,
+ OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+ OCFS2_XATTR_INDEX_TRUSTED,
+ OCFS2_XATTR_INDEX_SECURITY,
+ OCFS2_XATTR_MAX
+};
+
+extern struct xattr_handler ocfs2_xattr_user_handler;
+extern struct xattr_handler ocfs2_xattr_trusted_handler;
+
+extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t);
+extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
+ size_t, int);
+extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh);
+extern struct xattr_handler *ocfs2_xattr_handlers[];
+
+static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
+{
+ return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE;
+}
+
+static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb)
+{
+ return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits);
+}
+
+static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
+{
+ u16 len = sb->s_blocksize -
+ offsetof(struct ocfs2_xattr_header, xh_entries);
+
+ return len / sizeof(struct ocfs2_xattr_entry);
+}
+#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/Makefile b/fs/omfs/Makefile
new file mode 100644
index 00000000000..8b82b63f112
--- /dev/null
+++ b/fs/omfs/Makefile
@@ -0,0 +1,4 @@
+
+obj-$(CONFIG_OMFS_FS) += omfs.o
+
+omfs-y := bitmap.o dir.o file.o inode.o
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c
new file mode 100644
index 00000000000..e1c0ec0ae98
--- /dev/null
+++ b/fs/omfs/bitmap.c
@@ -0,0 +1,193 @@
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <asm/div64.h>
+#include "omfs.h"
+
+unsigned long omfs_count_free(struct super_block *sb)
+{
+ unsigned int i;
+ unsigned long sum = 0;
+ struct omfs_sb_info *sbi = OMFS_SB(sb);
+ int nbits = sb->s_blocksize * 8;
+
+ for (i = 0; i < sbi->s_imap_size; i++)
+ sum += nbits - bitmap_weight(sbi->s_imap[i], nbits);
+
+ return sum;
+}
+
+/*
+ * Counts the run of zero bits starting at bit up to max.
+ * It handles the case where a run might spill over a buffer.
+ * Called with bitmap lock.
+ */
+static int count_run(unsigned long **addr, int nbits,
+ int addrlen, int bit, int max)
+{
+ int count = 0;
+ int x;
+
+ for (; addrlen > 0; addrlen--, addr++) {
+ x = find_next_bit(*addr, nbits, bit);
+ count += x - bit;
+
+ if (x < nbits || count > max)
+ return min(count, max);
+
+ bit = 0;
+ }
+ return min(count, max);
+}
+
+/*
+ * Sets or clears the run of count bits starting with bit.
+ * Called with bitmap lock.
+ */
+static int set_run(struct super_block *sb, int map,
+ int nbits, int bit, int count, int set)
+{
+ int i;
+ int err;
+ struct buffer_head *bh;
+ struct omfs_sb_info *sbi = OMFS_SB(sb);
+
+ err = -ENOMEM;
+ bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map);
+ if (!bh)
+ goto out;
+
+ for (i = 0; i < count; i++, bit++) {
+ if (bit >= nbits) {
+ bit = 0;
+ map++;
+
+ mark_buffer_dirty(bh);
+ brelse(bh);
+ bh = sb_bread(sb,
+ clus_to_blk(sbi, sbi->s_bitmap_ino) + map);
+ if (!bh)
+ goto out;
+ }
+ if (set) {
+ set_bit(bit, sbi->s_imap[map]);
+ set_bit(bit, (unsigned long *)bh->b_data);
+ } else {
+ clear_bit(bit, sbi->s_imap[map]);
+ clear_bit(bit, (unsigned long *)bh->b_data);
+ }
+ }
+ mark_buffer_dirty(bh);
+ brelse(bh);
+ err = 0;
+out:
+ return err;
+}
+
+/*
+ * Tries to allocate exactly one block. Returns true if sucessful.
+ */
+int omfs_allocate_block(struct super_block *sb, u64 block)
+{
+ struct buffer_head *bh;
+ struct omfs_sb_info *sbi = OMFS_SB(sb);
+ int bits_per_entry = 8 * sb->s_blocksize;
+ unsigned int map, bit;
+ int ret = 0;
+ u64 tmp;
+
+ tmp = block;
+ bit = do_div(tmp, bits_per_entry);
+ map = tmp;
+
+ mutex_lock(&sbi->s_bitmap_lock);
+ if (map >= sbi->s_imap_size || test_and_set_bit(bit, sbi->s_imap[map]))
+ goto out;
+
+ if (sbi->s_bitmap_ino > 0) {
+ bh = sb_bread(sb, clus_to_blk(sbi, sbi->s_bitmap_ino) + map);
+ if (!bh)
+ goto out;
+
+ set_bit(bit, (unsigned long *)bh->b_data);
+ mark_buffer_dirty(bh);
+ brelse(bh);
+ }
+ ret = 1;
+out:
+ mutex_unlock(&sbi->s_bitmap_lock);
+ return ret;
+}
+
+
+/*
+ * Tries to allocate a set of blocks. The request size depends on the
+ * type: for inodes, we must allocate sbi->s_mirrors blocks, and for file
+ * blocks, we try to allocate sbi->s_clustersize, but can always get away
+ * with just one block.
+ */
+int omfs_allocate_range(struct super_block *sb,
+ int min_request,
+ int max_request,
+ u64 *return_block,
+ int *return_size)
+{
+ struct omfs_sb_info *sbi = OMFS_SB(sb);
+ int bits_per_entry = 8 * sb->s_blocksize;
+ int ret = 0;
+ int i, run, bit;
+
+ mutex_lock(&sbi->s_bitmap_lock);
+ for (i = 0; i < sbi->s_imap_size; i++) {
+ bit = 0;
+ while (bit < bits_per_entry) {
+ bit = find_next_zero_bit(sbi->s_imap[i], bits_per_entry,
+ bit);
+
+ if (bit == bits_per_entry)
+ break;
+
+ run = count_run(&sbi->s_imap[i], bits_per_entry,
+ sbi->s_imap_size-i, bit, max_request);
+
+ if (run >= min_request)
+ goto found;
+ bit += run;
+ }
+ }
+ ret = -ENOSPC;
+ goto out;
+
+found:
+ *return_block = i * bits_per_entry + bit;
+ *return_size = run;
+ ret = set_run(sb, i, bits_per_entry, bit, run, 1);
+
+out:
+ mutex_unlock(&sbi->s_bitmap_lock);
+ return ret;
+}
+
+/*
+ * Clears count bits starting at a given block.
+ */
+int omfs_clear_range(struct super_block *sb, u64 block, int count)
+{
+ struct omfs_sb_info *sbi = OMFS_SB(sb);
+ int bits_per_entry = 8 * sb->s_blocksize;
+ u64 tmp;
+ unsigned int map, bit;
+ int ret;
+
+ tmp = block;
+ bit = do_div(tmp, bits_per_entry);
+ map = tmp;
+
+ if (map >= sbi->s_imap_size)
+ return 0;
+
+ mutex_lock(&sbi->s_bitmap_lock);
+ ret = set_run(sb, map, bits_per_entry, bit, count, 0);
+ mutex_unlock(&sbi->s_bitmap_lock);
+ return ret;
+}
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
new file mode 100644
index 00000000000..c0757e99887
--- /dev/null
+++ b/fs/omfs/dir.c
@@ -0,0 +1,504 @@
+/*
+ * OMFS (as used by RIO Karma) directory operations.
+ * Copyright (C) 2005 Bob Copeland <me@bobcopeland.com>
+ * Released under GPL v2.
+ */
+
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <linux/buffer_head.h>
+#include "omfs.h"
+
+static int omfs_hash(const char *name, int namelen, int mod)
+{
+ int i, hash = 0;
+ for (i = 0; i < namelen; i++)
+ hash ^= tolower(name[i]) << (i % 24);
+ return hash % mod;
+}
+
+/*
+ * Finds the bucket for a given name and reads the containing block;
+ * *ofs is set to the offset of the first list entry.
+ */
+static struct buffer_head *omfs_get_bucket(struct inode *dir,
+ const char *name, int namelen, int *ofs)
+{
+ int nbuckets = (dir->i_size - OMFS_DIR_START)/8;
+ int block = clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino);
+ int bucket = omfs_hash(name, namelen, nbuckets);
+
+ *ofs = OMFS_DIR_START + bucket * 8;
+ return sb_bread(dir->i_sb, block);
+}
+
+static struct buffer_head *omfs_scan_list(struct inode *dir, u64 block,
+ const char *name, int namelen,
+ u64 *prev_block)
+{
+ struct buffer_head *bh;
+ struct omfs_inode *oi;
+ int err = -ENOENT;
+ *prev_block = ~0;
+
+ while (block != ~0) {
+ bh = sb_bread(dir->i_sb,
+ clus_to_blk(OMFS_SB(dir->i_sb), block));
+ if (!bh) {
+ err = -EIO;
+ goto err;
+ }
+
+ oi = (struct omfs_inode *) bh->b_data;
+ if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, block)) {
+ brelse(bh);
+ goto err;
+ }
+
+ if (strncmp(oi->i_name, name, namelen) == 0)
+ return bh;
+
+ *prev_block = block;
+ block = be64_to_cpu(oi->i_sibling);
+ brelse(bh);
+ }
+err:
+ return ERR_PTR(err);
+}
+
+static struct buffer_head *omfs_find_entry(struct inode *dir,
+ const char *name, int namelen)
+{
+ struct buffer_head *bh;
+ int ofs;
+ u64 block, dummy;
+
+ bh = omfs_get_bucket(dir, name, namelen, &ofs);
+ if (!bh)
+ return ERR_PTR(-EIO);
+
+ block = be64_to_cpu(*((__be64 *) &bh->b_data[ofs]));
+ brelse(bh);
+
+ return omfs_scan_list(dir, block, name, namelen, &dummy);
+}
+
+int omfs_make_empty(struct inode *inode, struct super_block *sb)
+{
+ struct omfs_sb_info *sbi = OMFS_SB(sb);
+ int block = clus_to_blk(sbi, inode->i_ino);
+ struct buffer_head *bh;
+ struct omfs_inode *oi;
+
+ bh = sb_bread(sb, block);
+ if (!bh)
+ return -ENOMEM;
+
+ memset(bh->b_data, 0, sizeof(struct omfs_inode));
+
+ if (inode->i_mode & S_IFDIR) {
+ memset(&bh->b_data[OMFS_DIR_START], 0xff,
+ sbi->s_sys_blocksize - OMFS_DIR_START);
+ } else
+ omfs_make_empty_table(bh, OMFS_EXTENT_START);
+
+ oi = (struct omfs_inode *) bh->b_data;
+ oi->i_head.h_self = cpu_to_be64(inode->i_ino);
+ oi->i_sibling = ~cpu_to_be64(0ULL);
+
+ mark_buffer_dirty(bh);
+ brelse(bh);
+ return 0;
+}
+
+static int omfs_add_link(struct dentry *dentry, struct inode *inode)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ struct omfs_inode *oi;
+ struct buffer_head *bh;
+ u64 block;
+ __be64 *entry;
+ int ofs;
+
+ /* just prepend to head of queue in proper bucket */
+ bh = omfs_get_bucket(dir, name, namelen, &ofs);
+ if (!bh)
+ goto out;
+
+ entry = (__be64 *) &bh->b_data[ofs];
+ block = be64_to_cpu(*entry);
+ *entry = cpu_to_be64(inode->i_ino);
+ mark_buffer_dirty(bh);
+ brelse(bh);
+
+ /* now set the sibling and parent pointers on the new inode */
+ bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), inode->i_ino));
+ if (!bh)
+ goto out;
+
+ oi = (struct omfs_inode *) bh->b_data;
+ memcpy(oi->i_name, name, namelen);
+ memset(oi->i_name + namelen, 0, OMFS_NAMELEN - namelen);
+ oi->i_sibling = cpu_to_be64(block);
+ oi->i_parent = cpu_to_be64(dir->i_ino);
+ mark_buffer_dirty(bh);
+ brelse(bh);
+
+ dir->i_ctime = CURRENT_TIME_SEC;
+
+ /* mark affected inodes dirty to rebuild checksums */
+ mark_inode_dirty(dir);
+ mark_inode_dirty(inode);
+ return 0;
+out:
+ return -ENOMEM;
+}
+
+static int omfs_delete_entry(struct dentry *dentry)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ struct inode *dirty;
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ struct omfs_inode *oi;
+ struct buffer_head *bh, *bh2;
+ __be64 *entry, next;
+ u64 block, prev;
+ int ofs;
+ int err = -ENOMEM;
+
+ /* delete the proper node in the bucket's linked list */
+ bh = omfs_get_bucket(dir, name, namelen, &ofs);
+ if (!bh)
+ goto out;
+
+ entry = (__be64 *) &bh->b_data[ofs];
+ block = be64_to_cpu(*entry);
+
+ bh2 = omfs_scan_list(dir, block, name, namelen, &prev);
+ if (IS_ERR(bh2)) {
+ err = PTR_ERR(bh2);
+ goto out_free_bh;
+ }
+
+ oi = (struct omfs_inode *) bh2->b_data;
+ next = oi->i_sibling;
+ brelse(bh2);
+
+ if (prev != ~0) {
+ /* found in middle of list, get list ptr */
+ brelse(bh);
+ bh = sb_bread(dir->i_sb,
+ clus_to_blk(OMFS_SB(dir->i_sb), prev));
+ if (!bh)
+ goto out;
+
+ oi = (struct omfs_inode *) bh->b_data;
+ entry = &oi->i_sibling;
+ }
+
+ *entry = next;
+ mark_buffer_dirty(bh);
+
+ if (prev != ~0) {
+ dirty = omfs_iget(dir->i_sb, prev);
+ if (!IS_ERR(dirty)) {
+ mark_inode_dirty(dirty);
+ iput(dirty);
+ }
+ }
+
+ err = 0;
+out_free_bh:
+ brelse(bh);
+out:
+ return err;
+}
+
+static int omfs_dir_is_empty(struct inode *inode)
+{
+ int nbuckets = (inode->i_size - OMFS_DIR_START) / 8;
+ struct buffer_head *bh;
+ u64 *ptr;
+ int i;
+
+ bh = sb_bread(inode->i_sb, clus_to_blk(OMFS_SB(inode->i_sb),
+ inode->i_ino));
+
+ if (!bh)
+ return 0;
+
+ ptr = (u64 *) &bh->b_data[OMFS_DIR_START];
+
+ for (i = 0; i < nbuckets; i++, ptr++)
+ if (*ptr != ~0)
+ break;
+
+ brelse(bh);
+ return *ptr != ~0;
+}
+
+static int omfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ int ret;
+ struct inode *inode = dentry->d_inode;
+
+ ret = omfs_delete_entry(dentry);
+ if (ret)
+ goto end_unlink;
+
+ inode_dec_link_count(inode);
+ mark_inode_dirty(dir);
+
+end_unlink:
+ return ret;
+}
+
+static int omfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ int err = -ENOTEMPTY;
+ struct inode *inode = dentry->d_inode;
+
+ if (omfs_dir_is_empty(inode)) {
+ err = omfs_unlink(dir, dentry);
+ if (!err)
+ inode_dec_link_count(inode);
+ }
+ return err;
+}
+
+static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode)
+{
+ int err;
+ struct inode *inode = omfs_new_inode(dir, mode);
+
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ err = omfs_make_empty(inode, dir->i_sb);
+ if (err)
+ goto out_free_inode;
+
+ err = omfs_add_link(dentry, inode);
+ if (err)
+ goto out_free_inode;
+
+ d_instantiate(dentry, inode);
+ return 0;
+
+out_free_inode:
+ iput(inode);
+ return err;
+}
+
+static int omfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ return omfs_add_node(dir, dentry, mode | S_IFDIR);
+}
+
+static int omfs_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ return omfs_add_node(dir, dentry, mode | S_IFREG);
+}
+
+static struct dentry *omfs_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct buffer_head *bh;
+ struct inode *inode = NULL;
+
+ if (dentry->d_name.len > OMFS_NAMELEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ bh = omfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len);
+ if (!IS_ERR(bh)) {
+ struct omfs_inode *oi = (struct omfs_inode *)bh->b_data;
+ ino_t ino = be64_to_cpu(oi->i_head.h_self);
+ brelse(bh);
+ inode = omfs_iget(dir->i_sb, ino);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ }
+ d_add(dentry, inode);
+ return NULL;
+}
+
+/* sanity check block's self pointer */
+int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
+ u64 fsblock)
+{
+ int is_bad;
+ u64 ino = be64_to_cpu(header->h_self);
+ is_bad = ((ino != fsblock) || (ino < sbi->s_root_ino) ||
+ (ino > sbi->s_num_blocks));
+
+ if (is_bad)
+ printk(KERN_WARNING "omfs: bad hash chain detected\n");
+
+ return is_bad;
+}
+
+static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
+ u64 fsblock, int hindex)
+{
+ struct inode *dir = filp->f_dentry->d_inode;
+ struct buffer_head *bh;
+ struct omfs_inode *oi;
+ u64 self;
+ int res = 0;
+ unsigned char d_type;
+
+ /* follow chain in this bucket */
+ while (fsblock != ~0) {
+ bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb),
+ fsblock));
+ if (!bh)
+ goto out;
+
+ oi = (struct omfs_inode *) bh->b_data;
+ if (omfs_is_bad(OMFS_SB(dir->i_sb), &oi->i_head, fsblock)) {
+ brelse(bh);
+ goto out;
+ }
+
+ self = fsblock;
+ fsblock = be64_to_cpu(oi->i_sibling);
+
+ /* skip visited nodes */
+ if (hindex) {
+ hindex--;
+ brelse(bh);
+ continue;
+ }
+
+ d_type = (oi->i_type == OMFS_DIR) ? DT_DIR : DT_REG;
+
+ res = filldir(dirent, oi->i_name, strnlen(oi->i_name,
+ OMFS_NAMELEN), filp->f_pos, self, d_type);
+ if (res == 0)
+ filp->f_pos++;
+ brelse(bh);
+ }
+out:
+ return res;
+}
+
+static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct inode *new_inode = new_dentry->d_inode;
+ struct inode *old_inode = old_dentry->d_inode;
+ struct buffer_head *bh;
+ int is_dir;
+ int err;
+
+ is_dir = S_ISDIR(old_inode->i_mode);
+
+ if (new_inode) {
+ /* overwriting existing file/dir */
+ err = -ENOTEMPTY;
+ if (is_dir && !omfs_dir_is_empty(new_inode))
+ goto out;
+
+ err = -ENOENT;
+ bh = omfs_find_entry(new_dir, new_dentry->d_name.name,
+ new_dentry->d_name.len);
+ if (IS_ERR(bh))
+ goto out;
+ brelse(bh);
+
+ err = omfs_unlink(new_dir, new_dentry);
+ if (err)
+ goto out;
+ }
+
+ /* since omfs locates files by name, we need to unlink _before_
+ * adding the new link or we won't find the old one */
+ inode_inc_link_count(old_inode);
+ err = omfs_unlink(old_dir, old_dentry);
+ if (err) {
+ inode_dec_link_count(old_inode);
+ goto out;
+ }
+
+ err = omfs_add_link(new_dentry, old_inode);
+ if (err)
+ goto out;
+
+ old_inode->i_ctime = CURRENT_TIME_SEC;
+out:
+ return err;
+}
+
+static int omfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ struct inode *dir = filp->f_dentry->d_inode;
+ struct buffer_head *bh;
+ loff_t offset, res;
+ unsigned int hchain, hindex;
+ int nbuckets;
+ u64 fsblock;
+ int ret = -EINVAL;
+
+ if (filp->f_pos >> 32)
+ goto success;
+
+ switch ((unsigned long) filp->f_pos) {
+ case 0:
+ if (filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR) < 0)
+ goto success;
+ filp->f_pos++;
+ /* fall through */
+ case 1:
+ if (filldir(dirent, "..", 2, 1,
+ parent_ino(filp->f_dentry), DT_DIR) < 0)
+ goto success;
+ filp->f_pos = 1 << 20;
+ /* fall through */
+ }
+
+ nbuckets = (dir->i_size - OMFS_DIR_START) / 8;
+
+ /* high 12 bits store bucket + 1 and low 20 bits store hash index */
+ hchain = (filp->f_pos >> 20) - 1;
+ hindex = filp->f_pos & 0xfffff;
+
+ bh = sb_bread(dir->i_sb, clus_to_blk(OMFS_SB(dir->i_sb), dir->i_ino));
+ if (!bh)
+ goto out;
+
+ offset = OMFS_DIR_START + hchain * 8;
+
+ for (; hchain < nbuckets; hchain++, offset += 8) {
+ fsblock = be64_to_cpu(*((__be64 *) &bh->b_data[offset]));
+
+ res = omfs_fill_chain(filp, dirent, filldir, fsblock, hindex);
+ hindex = 0;
+ if (res < 0)
+ break;
+
+ filp->f_pos = (hchain+2) << 20;
+ }
+ brelse(bh);
+success:
+ ret = 0;
+out:
+ return ret;
+}
+
+struct inode_operations omfs_dir_inops = {
+ .lookup = omfs_lookup,
+ .mkdir = omfs_mkdir,
+ .rename = omfs_rename,
+ .create = omfs_create,
+ .unlink = omfs_unlink,
+ .rmdir = omfs_rmdir,
+};
+
+struct file_operations omfs_dir_operations = {
+ .read = generic_read_dir,
+ .readdir = omfs_readdir,
+};
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
new file mode 100644
index 00000000000..834b2331f6b
--- /dev/null
+++ b/fs/omfs/file.c
@@ -0,0 +1,365 @@
+/*
+ * OMFS (as used by RIO Karma) file operations.
+ * Copyright (C) 2005 Bob Copeland <me@bobcopeland.com>
+ * Released under GPL v2.
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include "omfs.h"
+
+static int omfs_sync_file(struct file *file, struct dentry *dentry,
+ int datasync)
+{
+ struct inode *inode = dentry->d_inode;
+ int err;
+
+ err = sync_mapping_buffers(inode->i_mapping);
+ if (!(inode->i_state & I_DIRTY))
+ return err;
+ if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+ return err;
+ err |= omfs_sync_inode(inode);
+ return err ? -EIO : 0;
+}
+
+static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset)
+{
+ return (sbi->s_sys_blocksize - offset -
+ sizeof(struct omfs_extent)) /
+ sizeof(struct omfs_extent_entry) + 1;
+}
+
+void omfs_make_empty_table(struct buffer_head *bh, int offset)
+{
+ struct omfs_extent *oe = (struct omfs_extent *) &bh->b_data[offset];
+
+ oe->e_next = ~cpu_to_be64(0ULL);
+ oe->e_extent_count = cpu_to_be32(1),
+ oe->e_fill = cpu_to_be32(0x22),
+ oe->e_entry.e_cluster = ~cpu_to_be64(0ULL);
+ oe->e_entry.e_blocks = ~cpu_to_be64(0ULL);
+}
+
+int omfs_shrink_inode(struct inode *inode)
+{
+ struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
+ struct omfs_extent *oe;
+ struct omfs_extent_entry *entry;
+ struct buffer_head *bh;
+ u64 next, last;
+ u32 extent_count;
+ u32 max_extents;
+ int ret;
+
+ /* traverse extent table, freeing each entry that is greater
+ * than inode->i_size;
+ */
+ next = inode->i_ino;
+
+ /* only support truncate -> 0 for now */
+ ret = -EIO;
+ if (inode->i_size != 0)
+ goto out;
+
+ bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+ if (!bh)
+ goto out;
+
+ oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]);
+ max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START);
+
+ for (;;) {
+
+ if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next))
+ goto out_brelse;
+
+ extent_count = be32_to_cpu(oe->e_extent_count);
+
+ if (extent_count > max_extents)
+ goto out_brelse;
+
+ last = next;
+ next = be64_to_cpu(oe->e_next);
+ entry = &oe->e_entry;
+
+ /* ignore last entry as it is the terminator */
+ for (; extent_count > 1; extent_count--) {
+ u64 start, count;
+ start = be64_to_cpu(entry->e_cluster);
+ count = be64_to_cpu(entry->e_blocks);
+
+ omfs_clear_range(inode->i_sb, start, (int) count);
+ entry++;
+ }
+ omfs_make_empty_table(bh, (char *) oe - bh->b_data);
+ mark_buffer_dirty(bh);
+ brelse(bh);
+
+ if (last != inode->i_ino)
+ omfs_clear_range(inode->i_sb, last, sbi->s_mirrors);
+
+ if (next == ~0)
+ break;
+
+ bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+ if (!bh)
+ goto out;
+ oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
+ max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT);
+ }
+ ret = 0;
+out:
+ return ret;
+out_brelse:
+ brelse(bh);
+ return ret;
+}
+
+static void omfs_truncate(struct inode *inode)
+{
+ omfs_shrink_inode(inode);
+ mark_inode_dirty(inode);
+}
+
+/*
+ * Add new blocks to the current extent, or create new entries/continuations
+ * as necessary.
+ */
+static int omfs_grow_extent(struct inode *inode, struct omfs_extent *oe,
+ u64 *ret_block)
+{
+ struct omfs_extent_entry *terminator;
+ struct omfs_extent_entry *entry = &oe->e_entry;
+ struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
+ u32 extent_count = be32_to_cpu(oe->e_extent_count);
+ u64 new_block = 0;
+ u32 max_count;
+ int new_count;
+ int ret = 0;
+
+ /* reached the end of the extent table with no blocks mapped.
+ * there are three possibilities for adding: grow last extent,
+ * add a new extent to the current extent table, and add a
+ * continuation inode. in last two cases need an allocator for
+ * sbi->s_cluster_size
+ */
+
+ /* TODO: handle holes */
+
+ /* should always have a terminator */
+ if (extent_count < 1)
+ return -EIO;
+
+ /* trivially grow current extent, if next block is not taken */
+ terminator = entry + extent_count - 1;
+ if (extent_count > 1) {
+ entry = terminator-1;
+ new_block = be64_to_cpu(entry->e_cluster) +
+ be64_to_cpu(entry->e_blocks);
+
+ if (omfs_allocate_block(inode->i_sb, new_block)) {
+ entry->e_blocks =
+ cpu_to_be64(be64_to_cpu(entry->e_blocks) + 1);
+ terminator->e_blocks = ~(cpu_to_be64(
+ be64_to_cpu(~terminator->e_blocks) + 1));
+ goto out;
+ }
+ }
+ max_count = omfs_max_extents(sbi, OMFS_EXTENT_START);
+
+ /* TODO: add a continuation block here */
+ if (be32_to_cpu(oe->e_extent_count) > max_count-1)
+ return -EIO;
+
+ /* try to allocate a new cluster */
+ ret = omfs_allocate_range(inode->i_sb, 1, sbi->s_clustersize,
+ &new_block, &new_count);
+ if (ret)
+ goto out_fail;
+
+ /* copy terminator down an entry */
+ entry = terminator;
+ terminator++;
+ memcpy(terminator, entry, sizeof(struct omfs_extent_entry));
+
+ entry->e_cluster = cpu_to_be64(new_block);
+ entry->e_blocks = cpu_to_be64((u64) new_count);
+
+ terminator->e_blocks = ~(cpu_to_be64(
+ be64_to_cpu(~terminator->e_blocks) + (u64) new_count));
+
+ /* write in new entry */
+ oe->e_extent_count = cpu_to_be32(1 + be32_to_cpu(oe->e_extent_count));
+
+out:
+ *ret_block = new_block;
+out_fail:
+ return ret;
+}
+
+/*
+ * Scans across the directory table for a given file block number.
+ * If block not found, return 0.
+ */
+static sector_t find_block(struct inode *inode, struct omfs_extent_entry *ent,
+ sector_t block, int count, int *left)
+{
+ /* count > 1 because of terminator */
+ sector_t searched = 0;
+ for (; count > 1; count--) {
+ int numblocks = clus_to_blk(OMFS_SB(inode->i_sb),
+ be64_to_cpu(ent->e_blocks));
+
+ if (block >= searched &&
+ block < searched + numblocks) {
+ /*
+ * found it at cluster + (block - searched)
+ * numblocks - (block - searched) is remainder
+ */
+ *left = numblocks - (block - searched);
+ return clus_to_blk(OMFS_SB(inode->i_sb),
+ be64_to_cpu(ent->e_cluster)) +
+ block - searched;
+ }
+ searched += numblocks;
+ ent++;
+ }
+ return 0;
+}
+
+static int omfs_get_block(struct inode *inode, sector_t block,
+ struct buffer_head *bh_result, int create)
+{
+ struct buffer_head *bh;
+ sector_t next, offset;
+ int ret;
+ u64 new_block;
+ u32 max_extents;
+ int extent_count;
+ struct omfs_extent *oe;
+ struct omfs_extent_entry *entry;
+ struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
+ int max_blocks = bh_result->b_size >> inode->i_blkbits;
+ int remain;
+
+ ret = -EIO;
+ bh = sb_bread(inode->i_sb, clus_to_blk(sbi, inode->i_ino));
+ if (!bh)
+ goto out;
+
+ oe = (struct omfs_extent *)(&bh->b_data[OMFS_EXTENT_START]);
+ max_extents = omfs_max_extents(sbi, OMFS_EXTENT_START);
+ next = inode->i_ino;
+
+ for (;;) {
+
+ if (omfs_is_bad(sbi, (struct omfs_header *) bh->b_data, next))
+ goto out_brelse;
+
+ extent_count = be32_to_cpu(oe->e_extent_count);
+ next = be64_to_cpu(oe->e_next);
+ entry = &oe->e_entry;
+
+ if (extent_count > max_extents)
+ goto out_brelse;
+
+ offset = find_block(inode, entry, block, extent_count, &remain);
+ if (offset > 0) {
+ ret = 0;
+ map_bh(bh_result, inode->i_sb, offset);
+ if (remain > max_blocks)
+ remain = max_blocks;
+ bh_result->b_size = (remain << inode->i_blkbits);
+ goto out_brelse;
+ }
+ if (next == ~0)
+ break;
+
+ brelse(bh);
+ bh = sb_bread(inode->i_sb, clus_to_blk(sbi, next));
+ if (!bh)
+ goto out;
+ oe = (struct omfs_extent *) (&bh->b_data[OMFS_EXTENT_CONT]);
+ max_extents = omfs_max_extents(sbi, OMFS_EXTENT_CONT);
+ }
+ if (create) {
+ ret = omfs_grow_extent(inode, oe, &new_block);
+ if (ret == 0) {
+ mark_buffer_dirty(bh);
+ mark_inode_dirty(inode);
+ map_bh(bh_result, inode->i_sb,
+ clus_to_blk(sbi, new_block));
+ }
+ }
+out_brelse:
+ brelse(bh);
+out:
+ return ret;
+}
+
+static int omfs_readpage(struct file *file, struct page *page)
+{
+ return block_read_full_page(page, omfs_get_block);
+}
+
+static int omfs_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return mpage_readpages(mapping, pages, nr_pages, omfs_get_block);
+}
+
+static int omfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ return block_write_full_page(page, omfs_get_block, wbc);
+}
+
+static int
+omfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+ return mpage_writepages(mapping, wbc, omfs_get_block);
+}
+
+static int omfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ *pagep = NULL;
+ return block_write_begin(file, mapping, pos, len, flags,
+ pagep, fsdata, omfs_get_block);
+}
+
+static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
+{
+ return generic_block_bmap(mapping, block, omfs_get_block);
+}
+
+struct file_operations omfs_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = generic_file_aio_read,
+ .aio_write = generic_file_aio_write,
+ .mmap = generic_file_mmap,
+ .fsync = omfs_sync_file,
+ .splice_read = generic_file_splice_read,
+};
+
+struct inode_operations omfs_file_inops = {
+ .truncate = omfs_truncate
+};
+
+struct address_space_operations omfs_aops = {
+ .readpage = omfs_readpage,
+ .readpages = omfs_readpages,
+ .writepage = omfs_writepage,
+ .writepages = omfs_writepages,
+ .sync_page = block_sync_page,
+ .write_begin = omfs_write_begin,
+ .write_end = generic_write_end,
+ .bmap = omfs_bmap,
+};
+
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
new file mode 100644
index 00000000000..cbf047a847c
--- /dev/null
+++ b/fs/omfs/inode.c
@@ -0,0 +1,553 @@
+/*
+ * Optimized MPEG FS - inode and super operations.
+ * Copyright (C) 2006 Bob Copeland <me@bobcopeland.com>
+ * Released under GPL v2.
+ */
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/parser.h>
+#include <linux/buffer_head.h>
+#include <linux/vmalloc.h>
+#include <linux/crc-itu-t.h>
+#include "omfs.h"
+
+MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>");
+MODULE_DESCRIPTION("OMFS (ReplayTV/Karma) Filesystem for Linux");
+MODULE_LICENSE("GPL");
+
+struct inode *omfs_new_inode(struct inode *dir, int mode)
+{
+ struct inode *inode;
+ u64 new_block;
+ int err;
+ int len;
+ struct omfs_sb_info *sbi = OMFS_SB(dir->i_sb);
+
+ inode = new_inode(dir->i_sb);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ err = omfs_allocate_range(dir->i_sb, sbi->s_mirrors, sbi->s_mirrors,
+ &new_block, &len);
+ if (err)
+ goto fail;
+
+ inode->i_ino = new_block;
+ inode->i_mode = mode;
+ inode->i_uid = current->fsuid;
+ inode->i_gid = current->fsgid;
+ inode->i_blocks = 0;
+ inode->i_mapping->a_ops = &omfs_aops;
+
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ switch (mode & S_IFMT) {
+ case S_IFDIR:
+ inode->i_op = &omfs_dir_inops;
+ inode->i_fop = &omfs_dir_operations;
+ inode->i_size = sbi->s_sys_blocksize;
+ inc_nlink(inode);
+ break;
+ case S_IFREG:
+ inode->i_op = &omfs_file_inops;
+ inode->i_fop = &omfs_file_operations;
+ inode->i_size = 0;
+ break;
+ }
+
+ insert_inode_hash(inode);
+ mark_inode_dirty(inode);
+ return inode;
+fail:
+ make_bad_inode(inode);
+ iput(inode);
+ return ERR_PTR(err);
+}
+
+/*
+ * Update the header checksums for a dirty inode based on its contents.
+ * Caller is expected to hold the buffer head underlying oi and mark it
+ * dirty.
+ */
+static void omfs_update_checksums(struct omfs_inode *oi)
+{
+ int xor, i, ofs = 0, count;
+ u16 crc = 0;
+ unsigned char *ptr = (unsigned char *) oi;
+
+ count = be32_to_cpu(oi->i_head.h_body_size);
+ ofs = sizeof(struct omfs_header);
+
+ crc = crc_itu_t(crc, ptr + ofs, count);
+ oi->i_head.h_crc = cpu_to_be16(crc);
+
+ xor = ptr[0];
+ for (i = 1; i < OMFS_XOR_COUNT; i++)
+ xor ^= ptr[i];
+
+ oi->i_head.h_check_xor = xor;
+}
+
+static int omfs_write_inode(struct inode *inode, int wait)
+{
+ struct omfs_inode *oi;
+ struct omfs_sb_info *sbi = OMFS_SB(inode->i_sb);
+ struct buffer_head *bh, *bh2;
+ unsigned int block;
+ u64 ctime;
+ int i;
+ int ret = -EIO;
+ int sync_failed = 0;
+
+ /* get current inode since we may have written sibling ptrs etc. */
+ block = clus_to_blk(sbi, inode->i_ino);
+ bh = sb_bread(inode->i_sb, block);
+ if (!bh)
+ goto out;
+
+ oi = (struct omfs_inode *) bh->b_data;
+
+ oi->i_head.h_self = cpu_to_be64(inode->i_ino);
+ if (S_ISDIR(inode->i_mode))
+ oi->i_type = OMFS_DIR;
+ else if (S_ISREG(inode->i_mode))
+ oi->i_type = OMFS_FILE;
+ else {
+ printk(KERN_WARNING "omfs: unknown file type: %d\n",
+ inode->i_mode);
+ goto out_brelse;
+ }
+
+ oi->i_head.h_body_size = cpu_to_be32(sbi->s_sys_blocksize -
+ sizeof(struct omfs_header));
+ oi->i_head.h_version = 1;
+ oi->i_head.h_type = OMFS_INODE_NORMAL;
+ oi->i_head.h_magic = OMFS_IMAGIC;
+ oi->i_size = cpu_to_be64(inode->i_size);
+
+ ctime = inode->i_ctime.tv_sec * 1000LL +
+ ((inode->i_ctime.tv_nsec + 999)/1000);
+ oi->i_ctime = cpu_to_be64(ctime);
+
+ omfs_update_checksums(oi);
+
+ mark_buffer_dirty(bh);
+ if (wait) {
+ sync_dirty_buffer(bh);
+ if (buffer_req(bh) && !buffer_uptodate(bh))
+ sync_failed = 1;
+ }
+
+ /* if mirroring writes, copy to next fsblock */
+ for (i = 1; i < sbi->s_mirrors; i++) {
+ bh2 = sb_bread(inode->i_sb, block + i *
+ (sbi->s_blocksize / sbi->s_sys_blocksize));
+ if (!bh2)
+ goto out_brelse;
+
+ memcpy(bh2->b_data, bh->b_data, bh->b_size);
+ mark_buffer_dirty(bh2);
+ if (wait) {
+ sync_dirty_buffer(bh2);
+ if (buffer_req(bh2) && !buffer_uptodate(bh2))
+ sync_failed = 1;
+ }
+ brelse(bh2);
+ }
+ ret = (sync_failed) ? -EIO : 0;
+out_brelse:
+ brelse(bh);
+out:
+ return ret;
+}
+
+int omfs_sync_inode(struct inode *inode)
+{
+ return omfs_write_inode(inode, 1);
+}
+
+/*
+ * called when an entry is deleted, need to clear the bits in the
+ * bitmaps.
+ */
+static void omfs_delete_inode(struct inode *inode)
+{
+ truncate_inode_pages(&inode->i_data, 0);
+
+ if (S_ISREG(inode->i_mode)) {
+ inode->i_size = 0;
+ omfs_shrink_inode(inode);
+ }
+
+ omfs_clear_range(inode->i_sb, inode->i_ino, 2);
+ clear_inode(inode);
+}
+
+struct inode *omfs_iget(struct super_block *sb, ino_t ino)
+{
+ struct omfs_sb_info *sbi = OMFS_SB(sb);
+ struct omfs_inode *oi;
+ struct buffer_head *bh;
+ unsigned int block;
+ u64 ctime;
+ unsigned long nsecs;
+ struct inode *inode;
+
+ inode = iget_locked(sb, ino);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ block = clus_to_blk(sbi, ino);
+ bh = sb_bread(inode->i_sb, block);
+ if (!bh)
+ goto iget_failed;
+
+ oi = (struct omfs_inode *)bh->b_data;
+
+ /* check self */
+ if (ino != be64_to_cpu(oi->i_head.h_self))
+ goto fail_bh;
+
+ inode->i_uid = sbi->s_uid;
+ inode->i_gid = sbi->s_gid;
+
+ ctime = be64_to_cpu(oi->i_ctime);
+ nsecs = do_div(ctime, 1000) * 1000L;
+
+ inode->i_atime.tv_sec = ctime;
+ inode->i_mtime.tv_sec = ctime;
+ inode->i_ctime.tv_sec = ctime;
+ inode->i_atime.tv_nsec = nsecs;
+ inode->i_mtime.tv_nsec = nsecs;
+ inode->i_ctime.tv_nsec = nsecs;
+
+ inode->i_mapping->a_ops = &omfs_aops;
+
+ switch (oi->i_type) {
+ case OMFS_DIR:
+ inode->i_mode = S_IFDIR | (S_IRWXUGO & ~sbi->s_dmask);
+ inode->i_op = &omfs_dir_inops;
+ inode->i_fop = &omfs_dir_operations;
+ inode->i_size = sbi->s_sys_blocksize;
+ inc_nlink(inode);
+ break;
+ case OMFS_FILE:
+ inode->i_mode = S_IFREG | (S_IRWXUGO & ~sbi->s_fmask);
+ inode->i_fop = &omfs_file_operations;
+ inode->i_size = be64_to_cpu(oi->i_size);
+ break;
+ }
+ brelse(bh);
+ unlock_new_inode(inode);
+ return inode;
+fail_bh:
+ brelse(bh);
+iget_failed:
+ iget_failed(inode);
+ return ERR_PTR(-EIO);
+}
+
+static void omfs_put_super(struct super_block *sb)
+{
+ struct omfs_sb_info *sbi = OMFS_SB(sb);
+ kfree(sbi->s_imap);
+ kfree(sbi);
+ sb->s_fs_info = NULL;
+}
+
+static int omfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *s = dentry->d_sb;
+ struct omfs_sb_info *sbi = OMFS_SB(s);
+ buf->f_type = OMFS_MAGIC;
+ buf->f_bsize = sbi->s_blocksize;
+ buf->f_blocks = sbi->s_num_blocks;
+ buf->f_files = sbi->s_num_blocks;
+ buf->f_namelen = OMFS_NAMELEN;
+
+ buf->f_bfree = buf->f_bavail = buf->f_ffree =
+ omfs_count_free(s);
+ return 0;
+}
+
+static struct super_operations omfs_sops = {
+ .write_inode = omfs_write_inode,
+ .delete_inode = omfs_delete_inode,
+ .put_super = omfs_put_super,
+ .statfs = omfs_statfs,
+ .show_options = generic_show_options,
+};
+
+/*
+ * For Rio Karma, there is an on-disk free bitmap whose location is
+ * stored in the root block. For ReplayTV, there is no such free bitmap
+ * so we have to walk the tree. Both inodes and file data are allocated
+ * from the same map. This array can be big (300k) so we allocate
+ * in units of the blocksize.
+ */
+static int omfs_get_imap(struct super_block *sb)
+{
+ int bitmap_size;
+ int array_size;
+ int count;
+ struct omfs_sb_info *sbi = OMFS_SB(sb);
+ struct buffer_head *bh;
+ unsigned long **ptr;
+ sector_t block;
+
+ bitmap_size = DIV_ROUND_UP(sbi->s_num_blocks, 8);
+ array_size = DIV_ROUND_UP(bitmap_size, sb->s_blocksize);
+
+ if (sbi->s_bitmap_ino == ~0ULL)
+ goto out;
+
+ sbi->s_imap_size = array_size;
+ sbi->s_imap = kzalloc(array_size * sizeof(unsigned long *), GFP_KERNEL);
+ if (!sbi->s_imap)
+ goto nomem;
+
+ block = clus_to_blk(sbi, sbi->s_bitmap_ino);
+ ptr = sbi->s_imap;
+ for (count = bitmap_size; count > 0; count -= sb->s_blocksize) {
+ bh = sb_bread(sb, block++);
+ if (!bh)
+ goto nomem_free;
+ *ptr = kmalloc(sb->s_blocksize, GFP_KERNEL);
+ if (!*ptr) {
+ brelse(bh);
+ goto nomem_free;
+ }
+ memcpy(*ptr, bh->b_data, sb->s_blocksize);
+ if (count < sb->s_blocksize)
+ memset((void *)*ptr + count, 0xff,
+ sb->s_blocksize - count);
+ brelse(bh);
+ ptr++;
+ }
+out:
+ return 0;
+
+nomem_free:
+ for (count = 0; count < array_size; count++)
+ kfree(sbi->s_imap[count]);
+
+ kfree(sbi->s_imap);
+nomem:
+ sbi->s_imap = NULL;
+ sbi->s_imap_size = 0;
+ return -ENOMEM;
+}
+
+enum {
+ Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask
+};
+
+static const match_table_t tokens = {
+ {Opt_uid, "uid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_umask, "umask=%o"},
+ {Opt_dmask, "dmask=%o"},
+ {Opt_fmask, "fmask=%o"},
+};
+
+static int parse_options(char *options, struct omfs_sb_info *sbi)
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+ int option;
+
+ if (!options)
+ return 1;
+
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_uid:
+ if (match_int(&args[0], &option))
+ return 0;
+ sbi->s_uid = option;
+ break;
+ case Opt_gid:
+ if (match_int(&args[0], &option))
+ return 0;
+ sbi->s_gid = option;
+ break;
+ case Opt_umask:
+ if (match_octal(&args[0], &option))
+ return 0;
+ sbi->s_fmask = sbi->s_dmask = option;
+ break;
+ case Opt_dmask:
+ if (match_octal(&args[0], &option))
+ return 0;
+ sbi->s_dmask = option;
+ break;
+ case Opt_fmask:
+ if (match_octal(&args[0], &option))
+ return 0;
+ sbi->s_fmask = option;
+ break;
+ default:
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int omfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct buffer_head *bh, *bh2;
+ struct omfs_super_block *omfs_sb;
+ struct omfs_root_block *omfs_rb;
+ struct omfs_sb_info *sbi;
+ struct inode *root;
+ sector_t start;
+ int ret = -EINVAL;
+
+ save_mount_options(sb, (char *) data);
+
+ sbi = kzalloc(sizeof(struct omfs_sb_info), GFP_KERNEL);
+ if (!sbi)
+ return -ENOMEM;
+
+ sb->s_fs_info = sbi;
+
+ sbi->s_uid = current->uid;
+ sbi->s_gid = current->gid;
+ sbi->s_dmask = sbi->s_fmask = current->fs->umask;
+
+ if (!parse_options((char *) data, sbi))
+ goto end;
+
+ sb->s_maxbytes = 0xffffffff;
+
+ sb_set_blocksize(sb, 0x200);
+
+ bh = sb_bread(sb, 0);
+ if (!bh)
+ goto end;
+
+ omfs_sb = (struct omfs_super_block *)bh->b_data;
+
+ if (omfs_sb->s_magic != cpu_to_be32(OMFS_MAGIC)) {
+ if (!silent)
+ printk(KERN_ERR "omfs: Invalid superblock (%x)\n",
+ omfs_sb->s_magic);
+ goto out_brelse_bh;
+ }
+ sb->s_magic = OMFS_MAGIC;
+
+ sbi->s_num_blocks = be64_to_cpu(omfs_sb->s_num_blocks);
+ sbi->s_blocksize = be32_to_cpu(omfs_sb->s_blocksize);
+ sbi->s_mirrors = be32_to_cpu(omfs_sb->s_mirrors);
+ sbi->s_root_ino = be64_to_cpu(omfs_sb->s_root_block);
+ sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize);
+ mutex_init(&sbi->s_bitmap_lock);
+
+ if (sbi->s_sys_blocksize > PAGE_SIZE) {
+ printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n",
+ sbi->s_sys_blocksize);
+ goto out_brelse_bh;
+ }
+
+ if (sbi->s_blocksize < sbi->s_sys_blocksize ||
+ sbi->s_blocksize > OMFS_MAX_BLOCK_SIZE) {
+ printk(KERN_ERR "omfs: block size (%d) is out of range\n",
+ sbi->s_blocksize);
+ goto out_brelse_bh;
+ }
+
+ /*
+ * Use sys_blocksize as the fs block since it is smaller than a
+ * page while the fs blocksize can be larger.
+ */
+ sb_set_blocksize(sb, sbi->s_sys_blocksize);
+
+ /*
+ * ...and the difference goes into a shift. sys_blocksize is always
+ * a power of two factor of blocksize.
+ */
+ sbi->s_block_shift = get_bitmask_order(sbi->s_blocksize) -
+ get_bitmask_order(sbi->s_sys_blocksize);
+
+ start = clus_to_blk(sbi, be64_to_cpu(omfs_sb->s_root_block));
+ bh2 = sb_bread(sb, start);
+ if (!bh2)
+ goto out_brelse_bh;
+
+ omfs_rb = (struct omfs_root_block *)bh2->b_data;
+
+ sbi->s_bitmap_ino = be64_to_cpu(omfs_rb->r_bitmap);
+ sbi->s_clustersize = be32_to_cpu(omfs_rb->r_clustersize);
+
+ if (sbi->s_num_blocks != be64_to_cpu(omfs_rb->r_num_blocks)) {
+ printk(KERN_ERR "omfs: block count discrepancy between "
+ "super and root blocks (%llx, %llx)\n",
+ (unsigned long long)sbi->s_num_blocks,
+ (unsigned long long)be64_to_cpu(omfs_rb->r_num_blocks));
+ goto out_brelse_bh2;
+ }
+
+ ret = omfs_get_imap(sb);
+ if (ret)
+ goto out_brelse_bh2;
+
+ sb->s_op = &omfs_sops;
+
+ root = omfs_iget(sb, be64_to_cpu(omfs_rb->r_root_dir));
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto out_brelse_bh2;
+ }
+
+ sb->s_root = d_alloc_root(root);
+ if (!sb->s_root) {
+ iput(root);
+ goto out_brelse_bh2;
+ }
+ printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name);
+
+ ret = 0;
+out_brelse_bh2:
+ brelse(bh2);
+out_brelse_bh:
+ brelse(bh);
+end:
+ return ret;
+}
+
+static int omfs_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data, struct vfsmount *m)
+{
+ return get_sb_bdev(fs_type, flags, dev_name, data, omfs_fill_super, m);
+}
+
+static struct file_system_type omfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "omfs",
+ .get_sb = omfs_get_sb,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+
+static int __init init_omfs_fs(void)
+{
+ return register_filesystem(&omfs_fs_type);
+}
+
+static void __exit exit_omfs_fs(void)
+{
+ unregister_filesystem(&omfs_fs_type);
+}
+
+module_init(init_omfs_fs);
+module_exit(exit_omfs_fs);
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
new file mode 100644
index 00000000000..2bc0f067040
--- /dev/null
+++ b/fs/omfs/omfs.h
@@ -0,0 +1,67 @@
+#ifndef _OMFS_H
+#define _OMFS_H
+
+#include <linux/module.h>
+#include <linux/fs.h>
+
+#include "omfs_fs.h"
+
+/* In-memory structures */
+struct omfs_sb_info {
+ u64 s_num_blocks;
+ u64 s_bitmap_ino;
+ u64 s_root_ino;
+ u32 s_blocksize;
+ u32 s_mirrors;
+ u32 s_sys_blocksize;
+ u32 s_clustersize;
+ int s_block_shift;
+ unsigned long **s_imap;
+ int s_imap_size;
+ struct mutex s_bitmap_lock;
+ int s_uid;
+ int s_gid;
+ int s_dmask;
+ int s_fmask;
+};
+
+/* convert a cluster number to a scaled block number */
+static inline sector_t clus_to_blk(struct omfs_sb_info *sbi, sector_t block)
+{
+ return block << sbi->s_block_shift;
+}
+
+static inline struct omfs_sb_info *OMFS_SB(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+/* bitmap.c */
+extern unsigned long omfs_count_free(struct super_block *sb);
+extern int omfs_allocate_block(struct super_block *sb, u64 block);
+extern int omfs_allocate_range(struct super_block *sb, int min_request,
+ int max_request, u64 *return_block, int *return_size);
+extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
+
+/* dir.c */
+extern struct file_operations omfs_dir_operations;
+extern struct inode_operations omfs_dir_inops;
+extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
+extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
+ u64 fsblock);
+
+/* file.c */
+extern struct file_operations omfs_file_operations;
+extern struct inode_operations omfs_file_inops;
+extern struct address_space_operations omfs_aops;
+extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
+extern int omfs_shrink_inode(struct inode *inode);
+
+/* inode.c */
+extern struct inode *omfs_iget(struct super_block *sb, ino_t inode);
+extern struct inode *omfs_new_inode(struct inode *dir, int mode);
+extern int omfs_reserve_block(struct super_block *sb, sector_t block);
+extern int omfs_find_empty_block(struct super_block *sb, int mode, ino_t *ino);
+extern int omfs_sync_inode(struct inode *inode);
+
+#endif
diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h
new file mode 100644
index 00000000000..12cca245d6e
--- /dev/null
+++ b/fs/omfs/omfs_fs.h
@@ -0,0 +1,80 @@
+#ifndef _OMFS_FS_H
+#define _OMFS_FS_H
+
+/* OMFS On-disk structures */
+
+#define OMFS_MAGIC 0xC2993D87
+#define OMFS_IMAGIC 0xD2
+
+#define OMFS_DIR 'D'
+#define OMFS_FILE 'F'
+#define OMFS_INODE_NORMAL 'e'
+#define OMFS_INODE_CONTINUATION 'c'
+#define OMFS_INODE_SYSTEM 's'
+#define OMFS_NAMELEN 256
+#define OMFS_DIR_START 0x1b8
+#define OMFS_EXTENT_START 0x1d0
+#define OMFS_EXTENT_CONT 0x40
+#define OMFS_XOR_COUNT 19
+#define OMFS_MAX_BLOCK_SIZE 8192
+
+struct omfs_super_block {
+ char s_fill1[256];
+ __be64 s_root_block; /* block number of omfs_root_block */
+ __be64 s_num_blocks; /* total number of FS blocks */
+ __be32 s_magic; /* OMFS_MAGIC */
+ __be32 s_blocksize; /* size of a block */
+ __be32 s_mirrors; /* # of mirrors of system blocks */
+ __be32 s_sys_blocksize; /* size of non-data blocks */
+};
+
+struct omfs_header {
+ __be64 h_self; /* FS block where this is located */
+ __be32 h_body_size; /* size of useful data after header */
+ __be16 h_crc; /* crc-ccitt of body_size bytes */
+ char h_fill1[2];
+ u8 h_version; /* version, always 1 */
+ char h_type; /* OMFS_INODE_X */
+ u8 h_magic; /* OMFS_IMAGIC */
+ u8 h_check_xor; /* XOR of header bytes before this */
+ __be32 h_fill2;
+};
+
+struct omfs_root_block {
+ struct omfs_header r_head; /* header */
+ __be64 r_fill1;
+ __be64 r_num_blocks; /* total number of FS blocks */
+ __be64 r_root_dir; /* block # of root directory */
+ __be64 r_bitmap; /* block # of free space bitmap */
+ __be32 r_blocksize; /* size of a block */
+ __be32 r_clustersize; /* size allocated for data blocks */
+ __be64 r_mirrors; /* # of mirrors of system blocks */
+ char r_name[OMFS_NAMELEN]; /* partition label */
+};
+
+struct omfs_inode {
+ struct omfs_header i_head; /* header */
+ __be64 i_parent; /* parent containing this inode */
+ __be64 i_sibling; /* next inode in hash bucket */
+ __be64 i_ctime; /* ctime, in milliseconds */
+ char i_fill1[35];
+ char i_type; /* OMFS_[DIR,FILE] */
+ __be32 i_fill2;
+ char i_fill3[64];
+ char i_name[OMFS_NAMELEN]; /* filename */
+ __be64 i_size; /* size of file, in bytes */
+};
+
+struct omfs_extent_entry {
+ __be64 e_cluster; /* start location of a set of blocks */
+ __be64 e_blocks; /* number of blocks after e_cluster */
+};
+
+struct omfs_extent {
+ __be64 e_next; /* next extent table location */
+ __be32 e_extent_count; /* total # extents in this table */
+ __be32 e_fill;
+ struct omfs_extent_entry e_entry; /* start of extent entries */
+};
+
+#endif
diff --git a/fs/open.c b/fs/open.c
index a1450086e92..5596049863b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -16,6 +16,7 @@
#include <linux/namei.h>
#include <linux/backing-dev.h>
#include <linux/capability.h>
+#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/vfs.h>
@@ -63,7 +64,8 @@ static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
memcpy(buf, &st, sizeof(st));
else {
if (sizeof buf->f_blocks == 4) {
- if ((st.f_blocks | st.f_bfree | st.f_bavail) &
+ if ((st.f_blocks | st.f_bfree | st.f_bavail |
+ st.f_bsize | st.f_frsize) &
0xffffffff00000000ULL)
return -EOVERFLOW;
/*
@@ -120,37 +122,37 @@ static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
return 0;
}
-asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf)
+asmlinkage long sys_statfs(const char __user *pathname, struct statfs __user * buf)
{
- struct nameidata nd;
+ struct path path;
int error;
- error = user_path_walk(path, &nd);
+ error = user_path(pathname, &path);
if (!error) {
struct statfs tmp;
- error = vfs_statfs_native(nd.path.dentry, &tmp);
+ error = vfs_statfs_native(path.dentry, &tmp);
if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
error = -EFAULT;
- path_put(&nd.path);
+ path_put(&path);
}
return error;
}
-asmlinkage long sys_statfs64(const char __user *path, size_t sz, struct statfs64 __user *buf)
+asmlinkage long sys_statfs64(const char __user *pathname, size_t sz, struct statfs64 __user *buf)
{
- struct nameidata nd;
+ struct path path;
long error;
if (sz != sizeof(*buf))
return -EINVAL;
- error = user_path_walk(path, &nd);
+ error = user_path(pathname, &path);
if (!error) {
struct statfs64 tmp;
- error = vfs_statfs64(nd.path.dentry, &tmp);
+ error = vfs_statfs64(path.dentry, &tmp);
if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
error = -EFAULT;
- path_put(&nd.path);
+ path_put(&path);
}
return error;
}
@@ -221,20 +223,20 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
return err;
}
-static long do_sys_truncate(const char __user * path, loff_t length)
+static long do_sys_truncate(const char __user *pathname, loff_t length)
{
- struct nameidata nd;
- struct inode * inode;
+ struct path path;
+ struct inode *inode;
int error;
error = -EINVAL;
if (length < 0) /* sorry, but loff_t says... */
goto out;
- error = user_path_walk(path, &nd);
+ error = user_path(pathname, &path);
if (error)
goto out;
- inode = nd.path.dentry->d_inode;
+ inode = path.dentry->d_inode;
/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
error = -EISDIR;
@@ -245,16 +247,16 @@ static long do_sys_truncate(const char __user * path, loff_t length)
if (!S_ISREG(inode->i_mode))
goto dput_and_out;
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(path.mnt);
if (error)
goto dput_and_out;
- error = vfs_permission(&nd, MAY_WRITE);
+ error = inode_permission(inode, MAY_WRITE);
if (error)
goto mnt_drop_write_and_out;
error = -EPERM;
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ if (IS_APPEND(inode))
goto mnt_drop_write_and_out;
error = get_write_access(inode);
@@ -272,15 +274,15 @@ static long do_sys_truncate(const char __user * path, loff_t length)
error = locks_verify_truncate(inode, NULL, length);
if (!error) {
DQUOT_INIT(inode);
- error = do_truncate(nd.path.dentry, length, 0, NULL);
+ error = do_truncate(path.dentry, length, 0, NULL);
}
put_write_and_out:
put_write_access(inode);
mnt_drop_write_and_out:
- mnt_drop_write(nd.path.mnt);
+ mnt_drop_write(path.mnt);
dput_and_out:
- path_put(&nd.path);
+ path_put(&path);
out:
return error;
}
@@ -423,9 +425,10 @@ out:
*/
asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
{
- struct nameidata nd;
+ struct path path;
+ struct inode *inode;
int old_fsuid, old_fsgid;
- kernel_cap_t old_cap;
+ kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */
int res;
if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
@@ -433,32 +436,47 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
old_fsuid = current->fsuid;
old_fsgid = current->fsgid;
- old_cap = current->cap_effective;
current->fsuid = current->uid;
current->fsgid = current->gid;
- /*
- * Clear the capabilities if we switch to a non-root user
- *
- * FIXME: There is a race here against sys_capset. The
- * capabilities can change yet we will restore the old
- * value below. We should hold task_capabilities_lock,
- * but we cannot because user_path_walk can sleep.
- */
- if (current->uid)
- cap_clear(current->cap_effective);
- else
- current->cap_effective = current->cap_permitted;
+ if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+ /*
+ * Clear the capabilities if we switch to a non-root user
+ */
+#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
+ /*
+ * FIXME: There is a race here against sys_capset. The
+ * capabilities can change yet we will restore the old
+ * value below. We should hold task_capabilities_lock,
+ * but we cannot because user_path_at can sleep.
+ */
+#endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */
+ if (current->uid)
+ old_cap = cap_set_effective(__cap_empty_set);
+ else
+ old_cap = cap_set_effective(current->cap_permitted);
+ }
- res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
+ res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
if (res)
goto out;
- res = vfs_permission(&nd, mode);
+ inode = path.dentry->d_inode;
+
+ if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
+ /*
+ * MAY_EXEC on regular files is denied if the fs is mounted
+ * with the "noexec" flag.
+ */
+ res = -EACCES;
+ if (path.mnt->mnt_flags & MNT_NOEXEC)
+ goto out_path_release;
+ }
+
+ res = inode_permission(inode, mode | MAY_ACCESS);
/* SuS v2 requires we report a read only fs too */
- if(res || !(mode & S_IWOTH) ||
- special_file(nd.path.dentry->d_inode->i_mode))
+ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
goto out_path_release;
/*
* This is a rare case where using __mnt_is_readonly()
@@ -470,15 +488,17 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
* inherently racy and know that the fs may change
* state before we even see this result.
*/
- if (__mnt_is_readonly(nd.path.mnt))
+ if (__mnt_is_readonly(path.mnt))
res = -EROFS;
out_path_release:
- path_put(&nd.path);
+ path_put(&path);
out:
current->fsuid = old_fsuid;
current->fsgid = old_fsgid;
- current->cap_effective = old_cap;
+
+ if (!issecure(SECURE_NO_SETUID_FIXUP))
+ cap_set_effective(old_cap);
return res;
}
@@ -490,22 +510,21 @@ asmlinkage long sys_access(const char __user *filename, int mode)
asmlinkage long sys_chdir(const char __user * filename)
{
- struct nameidata nd;
+ struct path path;
int error;
- error = __user_walk(filename,
- LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_CHDIR, &nd);
+ error = user_path_dir(filename, &path);
if (error)
goto out;
- error = vfs_permission(&nd, MAY_EXEC);
+ error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
if (error)
goto dput_and_out;
- set_fs_pwd(current->fs, &nd.path);
+ set_fs_pwd(current->fs, &path);
dput_and_out:
- path_put(&nd.path);
+ path_put(&path);
out:
return error;
}
@@ -527,7 +546,7 @@ asmlinkage long sys_fchdir(unsigned int fd)
if (!S_ISDIR(inode->i_mode))
goto out_putf;
- error = file_permission(file, MAY_EXEC);
+ error = inode_permission(inode, MAY_EXEC | MAY_ACCESS);
if (!error)
set_fs_pwd(current->fs, &file->f_path);
out_putf:
@@ -538,14 +557,14 @@ out:
asmlinkage long sys_chroot(const char __user * filename)
{
- struct nameidata nd;
+ struct path path;
int error;
- error = __user_walk(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
+ error = user_path_dir(filename, &path);
if (error)
goto out;
- error = vfs_permission(&nd, MAY_EXEC);
+ error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_ACCESS);
if (error)
goto dput_and_out;
@@ -553,11 +572,10 @@ asmlinkage long sys_chroot(const char __user * filename)
if (!capable(CAP_SYS_CHROOT))
goto dput_and_out;
- set_fs_root(current->fs, &nd.path);
- set_fs_altroot();
+ set_fs_root(current->fs, &path);
error = 0;
dput_and_out:
- path_put(&nd.path);
+ path_put(&path);
out:
return error;
}
@@ -582,9 +600,6 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
err = mnt_want_write(file->f_path.mnt);
if (err)
goto out_putf;
- err = -EPERM;
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto out_drop_write;
mutex_lock(&inode->i_mutex);
if (mode == (mode_t) -1)
mode = inode->i_mode;
@@ -592,8 +607,6 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
err = notify_change(dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
-
-out_drop_write:
mnt_drop_write(file->f_path.mnt);
out_putf:
fput(file);
@@ -604,36 +617,29 @@ out:
asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
mode_t mode)
{
- struct nameidata nd;
- struct inode * inode;
+ struct path path;
+ struct inode *inode;
int error;
struct iattr newattrs;
- error = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW, &nd);
+ error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path);
if (error)
goto out;
- inode = nd.path.dentry->d_inode;
+ inode = path.dentry->d_inode;
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(path.mnt);
if (error)
goto dput_and_out;
-
- error = -EPERM;
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto out_drop_write;
-
mutex_lock(&inode->i_mutex);
if (mode == (mode_t) -1)
mode = inode->i_mode;
newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- error = notify_change(nd.path.dentry, &newattrs);
+ error = notify_change(path.dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
-
-out_drop_write:
- mnt_drop_write(nd.path.mnt);
+ mnt_drop_write(path.mnt);
dput_and_out:
- path_put(&nd.path);
+ path_put(&path);
out:
return error;
}
@@ -645,18 +651,10 @@ asmlinkage long sys_chmod(const char __user *filename, mode_t mode)
static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
{
- struct inode * inode;
+ struct inode *inode = dentry->d_inode;
int error;
struct iattr newattrs;
- error = -ENOENT;
- if (!(inode = dentry->d_inode)) {
- printk(KERN_ERR "chown_common: NULL inode\n");
- goto out;
- }
- error = -EPERM;
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto out;
newattrs.ia_valid = ATTR_CTIME;
if (user != (uid_t) -1) {
newattrs.ia_valid |= ATTR_UID;
@@ -672,25 +670,25 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
mutex_lock(&inode->i_mutex);
error = notify_change(dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
-out:
+
return error;
}
asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group)
{
- struct nameidata nd;
+ struct path path;
int error;
- error = user_path_walk(filename, &nd);
+ error = user_path(filename, &path);
if (error)
goto out;
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(path.mnt);
if (error)
goto out_release;
- error = chown_common(nd.path.dentry, user, group);
- mnt_drop_write(nd.path.mnt);
+ error = chown_common(path.dentry, user, group);
+ mnt_drop_write(path.mnt);
out_release:
- path_put(&nd.path);
+ path_put(&path);
out:
return error;
}
@@ -698,7 +696,7 @@ out:
asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
gid_t group, int flag)
{
- struct nameidata nd;
+ struct path path;
int error = -EINVAL;
int follow;
@@ -706,35 +704,35 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
goto out;
follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
- error = __user_walk_fd(dfd, filename, follow, &nd);
+ error = user_path_at(dfd, filename, follow, &path);
if (error)
goto out;
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(path.mnt);
if (error)
goto out_release;
- error = chown_common(nd.path.dentry, user, group);
- mnt_drop_write(nd.path.mnt);
+ error = chown_common(path.dentry, user, group);
+ mnt_drop_write(path.mnt);
out_release:
- path_put(&nd.path);
+ path_put(&path);
out:
return error;
}
asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group)
{
- struct nameidata nd;
+ struct path path;
int error;
- error = user_path_walk_link(filename, &nd);
+ error = user_lpath(filename, &path);
if (error)
goto out;
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(path.mnt);
if (error)
goto out_release;
- error = chown_common(nd.path.dentry, user, group);
- mnt_drop_write(nd.path.mnt);
+ error = chown_common(path.dentry, user, group);
+ mnt_drop_write(path.mnt);
out_release:
- path_put(&nd.path);
+ path_put(&path);
out:
return error;
}
@@ -965,71 +963,6 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
}
EXPORT_SYMBOL(dentry_open);
-/*
- * Find an empty file descriptor entry, and mark it busy.
- */
-int get_unused_fd_flags(int flags)
-{
- struct files_struct * files = current->files;
- int fd, error;
- struct fdtable *fdt;
-
- error = -EMFILE;
- spin_lock(&files->file_lock);
-
-repeat:
- fdt = files_fdtable(files);
- fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds,
- files->next_fd);
-
- /*
- * N.B. For clone tasks sharing a files structure, this test
- * will limit the total number of files that can be opened.
- */
- if (fd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
- goto out;
-
- /* Do we need to expand the fd array or fd set? */
- error = expand_files(files, fd);
- if (error < 0)
- goto out;
-
- if (error) {
- /*
- * If we needed to expand the fs array we
- * might have blocked - try again.
- */
- error = -EMFILE;
- goto repeat;
- }
-
- FD_SET(fd, fdt->open_fds);
- if (flags & O_CLOEXEC)
- FD_SET(fd, fdt->close_on_exec);
- else
- FD_CLR(fd, fdt->close_on_exec);
- files->next_fd = fd + 1;
-#if 1
- /* Sanity check */
- if (fdt->fd[fd] != NULL) {
- printk(KERN_WARNING "get_unused_fd: slot %d not NULL!\n", fd);
- fdt->fd[fd] = NULL;
- }
-#endif
- error = fd;
-
-out:
- spin_unlock(&files->file_lock);
- return error;
-}
-
-int get_unused_fd(void)
-{
- return get_unused_fd_flags(0);
-}
-
-EXPORT_SYMBOL(get_unused_fd);
-
static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
struct fdtable *fdt = files_fdtable(files);
@@ -1208,8 +1141,7 @@ EXPORT_SYMBOL(sys_close);
asmlinkage long sys_vhangup(void)
{
if (capable(CAP_SYS_TTY_CONFIG)) {
- /* XXX: this needs locking */
- tty_vhangup(current->signal->tty);
+ tty_vhangup_self();
return 0;
}
return -EPERM;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d17b4fd204e..9f5b054f06b 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -430,7 +430,7 @@ static struct file_system_type openprom_fs_type = {
.kill_sb = kill_anon_super,
};
-static void op_inode_init_once(struct kmem_cache * cachep, void *data)
+static void op_inode_init_once(void *data)
{
struct op_inode_info *oi = (struct op_inode_info *) data;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6149e4b58c8..7408227c49c 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -120,22 +120,21 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) =
* a pointer to that same buffer (for convenience).
*/
-char *disk_name(struct gendisk *hd, int part, char *buf)
+char *disk_name(struct gendisk *hd, int partno, char *buf)
{
- if (!part)
+ if (!partno)
snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
- snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, part);
+ snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
else
- snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, part);
+ snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
return buf;
}
const char *bdevname(struct block_device *bdev, char *buf)
{
- int part = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor;
- return disk_name(bdev->bd_disk, part, buf);
+ return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
}
EXPORT_SYMBOL(bdevname);
@@ -169,7 +168,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
if (isdigit(state->name[strlen(state->name)-1]))
sprintf(state->name, "p");
- state->limit = hd->minors;
+ state->limit = disk_max_parts(hd);
i = res = err = 0;
while (!res && check_part[i]) {
memset(&state->parts, 0, sizeof(state->parts));
@@ -204,21 +203,22 @@ static ssize_t part_start_show(struct device *dev,
return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
}
-static ssize_t part_size_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+ssize_t part_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
}
-static ssize_t part_stat_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+ssize_t part_stat_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
+ int cpu;
- preempt_disable();
- part_round_stats(p);
- preempt_enable();
+ cpu = part_stat_lock();
+ part_round_stats(cpu, p);
+ part_stat_unlock();
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
@@ -238,17 +238,17 @@ static ssize_t part_stat_show(struct device *dev,
}
#ifdef CONFIG_FAIL_MAKE_REQUEST
-static ssize_t part_fail_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+ssize_t part_fail_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct hd_struct *p = dev_to_part(dev);
return sprintf(buf, "%d\n", p->make_it_fail);
}
-static ssize_t part_fail_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+ssize_t part_fail_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
struct hd_struct *p = dev_to_part(dev);
int i;
@@ -300,40 +300,34 @@ struct device_type part_type = {
.release = part_release,
};
-static inline void partition_sysfs_add_subdir(struct hd_struct *p)
-{
- struct kobject *k;
-
- k = kobject_get(&p->dev.kobj);
- p->holder_dir = kobject_create_and_add("holders", k);
- kobject_put(k);
-}
-
-static inline void disk_sysfs_add_subdirs(struct gendisk *disk)
+static void delete_partition_rcu_cb(struct rcu_head *head)
{
- struct kobject *k;
+ struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
- k = kobject_get(&disk->dev.kobj);
- disk->holder_dir = kobject_create_and_add("holders", k);
- disk->slave_dir = kobject_create_and_add("slaves", k);
- kobject_put(k);
+ part->start_sect = 0;
+ part->nr_sects = 0;
+ part_stat_set_all(part, 0);
+ put_device(part_to_dev(part));
}
-void delete_partition(struct gendisk *disk, int part)
+void delete_partition(struct gendisk *disk, int partno)
{
- struct hd_struct *p = disk->part[part-1];
+ struct disk_part_tbl *ptbl = disk->part_tbl;
+ struct hd_struct *part;
- if (!p)
+ if (partno >= ptbl->len)
return;
- if (!p->nr_sects)
+
+ part = ptbl->part[partno];
+ if (!part)
return;
- disk->part[part-1] = NULL;
- p->start_sect = 0;
- p->nr_sects = 0;
- part_stat_set_all(p, 0);
- kobject_put(p->holder_dir);
- device_del(&p->dev);
- put_device(&p->dev);
+
+ blk_free_devt(part_devt(part));
+ rcu_assign_pointer(ptbl->part[partno], NULL);
+ kobject_put(part->holder_dir);
+ device_del(part_to_dev(part));
+
+ call_rcu(&part->rcu_head, delete_partition_rcu_cb);
}
static ssize_t whole_disk_show(struct device *dev,
@@ -344,86 +338,132 @@ static ssize_t whole_disk_show(struct device *dev,
static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
whole_disk_show, NULL);
-void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags)
+int add_partition(struct gendisk *disk, int partno,
+ sector_t start, sector_t len, int flags)
{
struct hd_struct *p;
+ dev_t devt = MKDEV(0, 0);
+ struct device *ddev = disk_to_dev(disk);
+ struct device *pdev;
+ struct disk_part_tbl *ptbl;
+ const char *dname;
int err;
+ err = disk_expand_part_tbl(disk, partno);
+ if (err)
+ return err;
+ ptbl = disk->part_tbl;
+
+ if (ptbl->part[partno])
+ return -EBUSY;
+
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
- return;
+ return -ENOMEM;
if (!init_part_stats(p)) {
- kfree(p);
- return;
+ err = -ENOMEM;
+ goto out_free;
}
+ pdev = part_to_dev(p);
+
p->start_sect = start;
p->nr_sects = len;
- p->partno = part;
- p->policy = disk->policy;
+ p->partno = partno;
+ p->policy = get_disk_ro(disk);
- if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1]))
- snprintf(p->dev.bus_id, BUS_ID_SIZE,
- "%sp%d", disk->dev.bus_id, part);
+ dname = dev_name(ddev);
+ if (isdigit(dname[strlen(dname) - 1]))
+ snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno);
else
- snprintf(p->dev.bus_id, BUS_ID_SIZE,
- "%s%d", disk->dev.bus_id, part);
+ snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno);
- device_initialize(&p->dev);
- p->dev.devt = MKDEV(disk->major, disk->first_minor + part);
- p->dev.class = &block_class;
- p->dev.type = &part_type;
- p->dev.parent = &disk->dev;
- disk->part[part-1] = p;
+ device_initialize(pdev);
+ pdev->class = &block_class;
+ pdev->type = &part_type;
+ pdev->parent = ddev;
+
+ err = blk_alloc_devt(p, &devt);
+ if (err)
+ goto out_free;
+ pdev->devt = devt;
/* delay uevent until 'holders' subdir is created */
- p->dev.uevent_suppress = 1;
- device_add(&p->dev);
- partition_sysfs_add_subdir(p);
- p->dev.uevent_suppress = 0;
- if (flags & ADDPART_FLAG_WHOLEDISK)
- err = device_create_file(&p->dev, &dev_attr_whole_disk);
+ pdev->uevent_suppress = 1;
+ err = device_add(pdev);
+ if (err)
+ goto out_put;
+
+ err = -ENOMEM;
+ p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
+ if (!p->holder_dir)
+ goto out_del;
+
+ pdev->uevent_suppress = 0;
+ if (flags & ADDPART_FLAG_WHOLEDISK) {
+ err = device_create_file(pdev, &dev_attr_whole_disk);
+ if (err)
+ goto out_del;
+ }
+
+ /* everything is up and running, commence */
+ INIT_RCU_HEAD(&p->rcu_head);
+ rcu_assign_pointer(ptbl->part[partno], p);
/* suppress uevent if the disk supresses it */
- if (!disk->dev.uevent_suppress)
- kobject_uevent(&p->dev.kobj, KOBJ_ADD);
+ if (!ddev->uevent_suppress)
+ kobject_uevent(&pdev->kobj, KOBJ_ADD);
+
+ return 0;
+
+out_free:
+ kfree(p);
+ return err;
+out_del:
+ kobject_put(p->holder_dir);
+ device_del(pdev);
+out_put:
+ put_device(pdev);
+ blk_free_devt(devt);
+ return err;
}
/* Not exported, helper to add_disk(). */
void register_disk(struct gendisk *disk)
{
+ struct device *ddev = disk_to_dev(disk);
struct block_device *bdev;
+ struct disk_part_iter piter;
+ struct hd_struct *part;
char *s;
- int i;
- struct hd_struct *p;
int err;
- disk->dev.parent = disk->driverfs_dev;
- disk->dev.devt = MKDEV(disk->major, disk->first_minor);
+ ddev->parent = disk->driverfs_dev;
- strlcpy(disk->dev.bus_id, disk->disk_name, KOBJ_NAME_LEN);
+ strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE);
/* ewww... some of these buggers have / in the name... */
- s = strchr(disk->dev.bus_id, '/');
+ s = strchr(ddev->bus_id, '/');
if (s)
*s = '!';
/* delay uevents, until we scanned partition table */
- disk->dev.uevent_suppress = 1;
+ ddev->uevent_suppress = 1;
- if (device_add(&disk->dev))
+ if (device_add(ddev))
return;
#ifndef CONFIG_SYSFS_DEPRECATED
- err = sysfs_create_link(block_depr, &disk->dev.kobj,
- kobject_name(&disk->dev.kobj));
+ err = sysfs_create_link(block_depr, &ddev->kobj,
+ kobject_name(&ddev->kobj));
if (err) {
- device_del(&disk->dev);
+ device_del(ddev);
return;
}
#endif
- disk_sysfs_add_subdirs(disk);
+ disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
+ disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
/* No minors to use for partitions */
- if (disk->minors == 1)
+ if (!disk_partitionable(disk))
goto exit;
/* No such device (e.g., media were just removed) */
@@ -442,51 +482,73 @@ void register_disk(struct gendisk *disk)
exit:
/* announce disk after possible partitions are created */
- disk->dev.uevent_suppress = 0;
- kobject_uevent(&disk->dev.kobj, KOBJ_ADD);
+ ddev->uevent_suppress = 0;
+ kobject_uevent(&ddev->kobj, KOBJ_ADD);
/* announce possible partitions */
- for (i = 1; i < disk->minors; i++) {
- p = disk->part[i-1];
- if (!p || !p->nr_sects)
- continue;
- kobject_uevent(&p->dev.kobj, KOBJ_ADD);
- }
+ disk_part_iter_init(&piter, disk, 0);
+ while ((part = disk_part_iter_next(&piter)))
+ kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
+ disk_part_iter_exit(&piter);
}
int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
{
+ struct disk_part_iter piter;
+ struct hd_struct *part;
struct parsed_partitions *state;
- int p, res;
+ int p, highest, res;
if (bdev->bd_part_count)
return -EBUSY;
res = invalidate_partition(disk, 0);
if (res)
return res;
- bdev->bd_invalidated = 0;
- for (p = 1; p < disk->minors; p++)
- delete_partition(disk, p);
+
+ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
+ while ((part = disk_part_iter_next(&piter)))
+ delete_partition(disk, part->partno);
+ disk_part_iter_exit(&piter);
+
if (disk->fops->revalidate_disk)
disk->fops->revalidate_disk(disk);
+ check_disk_size_change(disk, bdev);
+ bdev->bd_invalidated = 0;
if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
return 0;
if (IS_ERR(state)) /* I/O error reading the partition table */
return -EIO;
/* tell userspace that the media / partition table may have changed */
- kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE);
+ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+
+ /* Detect the highest partition number and preallocate
+ * disk->part_tbl. This is an optimization and not strictly
+ * necessary.
+ */
+ for (p = 1, highest = 0; p < state->limit; p++)
+ if (state->parts[p].size)
+ highest = p;
+
+ disk_expand_part_tbl(disk, highest);
+ /* add partitions */
for (p = 1; p < state->limit; p++) {
sector_t size = state->parts[p].size;
sector_t from = state->parts[p].from;
if (!size)
continue;
if (from + size > get_capacity(disk)) {
- printk(" %s: p%d exceeds device capacity\n",
+ printk(KERN_WARNING
+ "%s: p%d exceeds device capacity\n",
disk->disk_name, p);
}
- add_partition(disk, p, from, size, state->parts[p].flags);
+ res = add_partition(disk, p, from, size, state->parts[p].flags);
+ if (res) {
+ printk(KERN_ERR " %s: p%d could not be added: %d\n",
+ disk->disk_name, p, -res);
+ continue;
+ }
#ifdef CONFIG_BLK_DEV_MD
if (state->parts[p].flags & ADDPART_FLAG_RAID)
md_autodetect_dev(bdev->bd_dev+p);
@@ -519,25 +581,31 @@ EXPORT_SYMBOL(read_dev_sector);
void del_gendisk(struct gendisk *disk)
{
- int p;
+ struct disk_part_iter piter;
+ struct hd_struct *part;
/* invalidate stuff */
- for (p = disk->minors - 1; p > 0; p--) {
- invalidate_partition(disk, p);
- delete_partition(disk, p);
+ disk_part_iter_init(&piter, disk,
+ DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
+ while ((part = disk_part_iter_next(&piter))) {
+ invalidate_partition(disk, part->partno);
+ delete_partition(disk, part->partno);
}
+ disk_part_iter_exit(&piter);
+
invalidate_partition(disk, 0);
- disk->capacity = 0;
+ blk_free_devt(disk_to_dev(disk)->devt);
+ set_capacity(disk, 0);
disk->flags &= ~GENHD_FL_UP;
unlink_gendisk(disk);
- disk_stat_set_all(disk, 0);
- disk->stamp = 0;
+ part_stat_set_all(&disk->part0, 0);
+ disk->part0.stamp = 0;
- kobject_put(disk->holder_dir);
+ kobject_put(disk->part0.holder_dir);
kobject_put(disk->slave_dir);
disk->driverfs_dev = NULL;
#ifndef CONFIG_SYSFS_DEPRECATED
- sysfs_remove_link(block_depr, disk->dev.bus_id);
+ sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
#endif
- device_del(&disk->dev);
+ device_del(disk_to_dev(disk));
}
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 17ae8ecd9e8..98dbe1a8452 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -5,15 +5,13 @@
* add_gd_partition adds a partitions details to the devices partition
* description.
*/
-enum { MAX_PART = 256 };
-
struct parsed_partitions {
char name[BDEVNAME_SIZE];
struct {
sector_t from;
sector_t size;
int flags;
- } parts[MAX_PART];
+ } parts[DISK_MAX_PARTS];
int next;
int limit;
};
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index e7b07006bc4..038a6022152 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -95,13 +95,6 @@
#include "check.h"
#include "efi.h"
-#undef EFI_DEBUG
-#ifdef EFI_DEBUG
-#define Dprintk(x...) printk(KERN_DEBUG x)
-#else
-#define Dprintk(x...)
-#endif
-
/* This allows a kernel command line option 'gpt' to override
* the test for invalid PMBR. Not __initdata because reloading
* the partition tables happens after init too.
@@ -305,10 +298,10 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
/* Check the GUID Partition Table signature */
if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) {
- Dprintk("GUID Partition Table Header signature is wrong:"
- "%lld != %lld\n",
- (unsigned long long)le64_to_cpu((*gpt)->signature),
- (unsigned long long)GPT_HEADER_SIGNATURE);
+ pr_debug("GUID Partition Table Header signature is wrong:"
+ "%lld != %lld\n",
+ (unsigned long long)le64_to_cpu((*gpt)->signature),
+ (unsigned long long)GPT_HEADER_SIGNATURE);
goto fail;
}
@@ -318,9 +311,8 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size));
if (crc != origcrc) {
- Dprintk
- ("GUID Partition Table Header CRC is wrong: %x != %x\n",
- crc, origcrc);
+ pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n",
+ crc, origcrc);
goto fail;
}
(*gpt)->header_crc32 = cpu_to_le32(origcrc);
@@ -328,9 +320,9 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
/* Check that the my_lba entry points to the LBA that contains
* the GUID Partition Table */
if (le64_to_cpu((*gpt)->my_lba) != lba) {
- Dprintk("GPT my_lba incorrect: %lld != %lld\n",
- (unsigned long long)le64_to_cpu((*gpt)->my_lba),
- (unsigned long long)lba);
+ pr_debug("GPT my_lba incorrect: %lld != %lld\n",
+ (unsigned long long)le64_to_cpu((*gpt)->my_lba),
+ (unsigned long long)lba);
goto fail;
}
@@ -339,15 +331,15 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
*/
lastlba = last_lba(bdev);
if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
- Dprintk("GPT: first_usable_lba incorrect: %lld > %lld\n",
- (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
- (unsigned long long)lastlba);
+ pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
+ (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
+ (unsigned long long)lastlba);
goto fail;
}
if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) {
- Dprintk("GPT: last_usable_lba incorrect: %lld > %lld\n",
- (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba),
- (unsigned long long)lastlba);
+ pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n",
+ (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba),
+ (unsigned long long)lastlba);
goto fail;
}
@@ -360,7 +352,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
le32_to_cpu((*gpt)->sizeof_partition_entry));
if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) {
- Dprintk("GUID Partitition Entry Array CRC check failed.\n");
+ pr_debug("GUID Partitition Entry Array CRC check failed.\n");
goto fail_ptes;
}
@@ -616,7 +608,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
return 0;
}
- Dprintk("GUID Partition Table is valid! Yea!\n");
+ pr_debug("GUID Partition Table is valid! Yea!\n");
for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
if (!is_pte_valid(&ptes[i], last_lba(bdev)))
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 0fdda2e8a4c..8652fb99e96 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -133,17 +133,17 @@ static bool ldm_parse_privhead(const u8 *data, struct privhead *ph)
bool is_vista = false;
BUG_ON(!data || !ph);
- if (MAGIC_PRIVHEAD != BE64(data)) {
+ if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) {
ldm_error("Cannot find PRIVHEAD structure. LDM database is"
" corrupt. Aborting.");
return false;
}
- ph->ver_major = BE16(data + 0x000C);
- ph->ver_minor = BE16(data + 0x000E);
- ph->logical_disk_start = BE64(data + 0x011B);
- ph->logical_disk_size = BE64(data + 0x0123);
- ph->config_start = BE64(data + 0x012B);
- ph->config_size = BE64(data + 0x0133);
+ ph->ver_major = get_unaligned_be16(data + 0x000C);
+ ph->ver_minor = get_unaligned_be16(data + 0x000E);
+ ph->logical_disk_start = get_unaligned_be64(data + 0x011B);
+ ph->logical_disk_size = get_unaligned_be64(data + 0x0123);
+ ph->config_start = get_unaligned_be64(data + 0x012B);
+ ph->config_size = get_unaligned_be64(data + 0x0133);
/* Version 2.11 is Win2k/XP and version 2.12 is Vista. */
if (ph->ver_major == 2 && ph->ver_minor == 12)
is_vista = true;
@@ -191,14 +191,14 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
{
BUG_ON (!data || !toc);
- if (MAGIC_TOCBLOCK != BE64 (data)) {
+ if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) {
ldm_crit ("Cannot find TOCBLOCK, database may be corrupt.");
return false;
}
strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name));
toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0;
- toc->bitmap1_start = BE64 (data + 0x2E);
- toc->bitmap1_size = BE64 (data + 0x36);
+ toc->bitmap1_start = get_unaligned_be64(data + 0x2E);
+ toc->bitmap1_size = get_unaligned_be64(data + 0x36);
if (strncmp (toc->bitmap1_name, TOC_BITMAP1,
sizeof (toc->bitmap1_name)) != 0) {
@@ -208,8 +208,8 @@ static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc)
}
strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name));
toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0;
- toc->bitmap2_start = BE64 (data + 0x50);
- toc->bitmap2_size = BE64 (data + 0x58);
+ toc->bitmap2_start = get_unaligned_be64(data + 0x50);
+ toc->bitmap2_size = get_unaligned_be64(data + 0x58);
if (strncmp (toc->bitmap2_name, TOC_BITMAP2,
sizeof (toc->bitmap2_name)) != 0) {
ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.",
@@ -237,22 +237,22 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
{
BUG_ON (!data || !vm);
- if (MAGIC_VMDB != BE32 (data)) {
+ if (MAGIC_VMDB != get_unaligned_be32(data)) {
ldm_crit ("Cannot find the VMDB, database may be corrupt.");
return false;
}
- vm->ver_major = BE16 (data + 0x12);
- vm->ver_minor = BE16 (data + 0x14);
+ vm->ver_major = get_unaligned_be16(data + 0x12);
+ vm->ver_minor = get_unaligned_be16(data + 0x14);
if ((vm->ver_major != 4) || (vm->ver_minor != 10)) {
ldm_error ("Expected VMDB version %d.%d, got %d.%d. "
"Aborting.", 4, 10, vm->ver_major, vm->ver_minor);
return false;
}
- vm->vblk_size = BE32 (data + 0x08);
- vm->vblk_offset = BE32 (data + 0x0C);
- vm->last_vblk_seq = BE32 (data + 0x04);
+ vm->vblk_size = get_unaligned_be32(data + 0x08);
+ vm->vblk_offset = get_unaligned_be32(data + 0x0C);
+ vm->last_vblk_seq = get_unaligned_be32(data + 0x04);
ldm_debug ("Parsed VMDB successfully.");
return true;
@@ -507,7 +507,7 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
goto out; /* Already logged */
/* Are there uncommitted transactions? */
- if (BE16(data + 0x10) != 0x01) {
+ if (get_unaligned_be16(data + 0x10) != 0x01) {
ldm_crit ("Database is not in a consistent state. Aborting.");
goto out;
}
@@ -802,7 +802,7 @@ static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb)
return false;
len += VBLK_SIZE_CMP3;
- if (len != BE32 (buffer + 0x14))
+ if (len != get_unaligned_be32(buffer + 0x14))
return false;
comp = &vb->vblk.comp;
@@ -851,7 +851,7 @@ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb)
return false;
len += VBLK_SIZE_DGR3;
- if (len != BE32 (buffer + 0x14))
+ if (len != get_unaligned_be32(buffer + 0x14))
return false;
dgrp = &vb->vblk.dgrp;
@@ -895,7 +895,7 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
return false;
len += VBLK_SIZE_DGR4;
- if (len != BE32 (buffer + 0x14))
+ if (len != get_unaligned_be32(buffer + 0x14))
return false;
dgrp = &vb->vblk.dgrp;
@@ -931,7 +931,7 @@ static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb)
return false;
len += VBLK_SIZE_DSK3;
- if (len != BE32 (buffer + 0x14))
+ if (len != get_unaligned_be32(buffer + 0x14))
return false;
disk = &vb->vblk.disk;
@@ -968,7 +968,7 @@ static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb)
return false;
len += VBLK_SIZE_DSK4;
- if (len != BE32 (buffer + 0x14))
+ if (len != get_unaligned_be32(buffer + 0x14))
return false;
disk = &vb->vblk.disk;
@@ -1034,14 +1034,14 @@ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb)
return false;
}
len += VBLK_SIZE_PRT3;
- if (len > BE32(buffer + 0x14)) {
+ if (len > get_unaligned_be32(buffer + 0x14)) {
ldm_error("len %d > BE32(buffer + 0x14) %d", len,
- BE32(buffer + 0x14));
+ get_unaligned_be32(buffer + 0x14));
return false;
}
part = &vb->vblk.part;
- part->start = BE64(buffer + 0x24 + r_name);
- part->volume_offset = BE64(buffer + 0x2C + r_name);
+ part->start = get_unaligned_be64(buffer + 0x24 + r_name);
+ part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name);
part->size = ldm_get_vnum(buffer + 0x34 + r_name);
part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size);
part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent);
@@ -1139,9 +1139,9 @@ static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb)
return false;
}
len += VBLK_SIZE_VOL5;
- if (len > BE32(buffer + 0x14)) {
+ if (len > get_unaligned_be32(buffer + 0x14)) {
ldm_error("len %d > BE32(buffer + 0x14) %d", len,
- BE32(buffer + 0x14));
+ get_unaligned_be32(buffer + 0x14));
return false;
}
volu = &vb->vblk.volu;
@@ -1294,9 +1294,9 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
BUG_ON (!data || !frags);
- group = BE32 (data + 0x08);
- rec = BE16 (data + 0x0C);
- num = BE16 (data + 0x0E);
+ group = get_unaligned_be32(data + 0x08);
+ rec = get_unaligned_be16(data + 0x0C);
+ num = get_unaligned_be16(data + 0x0E);
if ((num < 1) || (num > 4)) {
ldm_error ("A VBLK claims to have %d parts.", num);
return false;
@@ -1425,12 +1425,12 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
}
for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */
- if (MAGIC_VBLK != BE32 (data)) {
+ if (MAGIC_VBLK != get_unaligned_be32(data)) {
ldm_error ("Expected to find a VBLK.");
goto out;
}
- recs = BE16 (data + 0x0E); /* Number of records */
+ recs = get_unaligned_be16(data + 0x0E); /* Number of records */
if (recs == 1) {
if (!ldm_ldmdb_add (data, size, ldb))
goto out; /* Already logged */
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index 80f63b5fdd9..30e08e809c1 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -98,11 +98,6 @@ struct parsed_partitions;
#define TOC_BITMAP1 "config" /* Names of the two defined */
#define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */
-/* Most numbers we deal with are big-endian and won't be aligned. */
-#define BE16(x) ((u16)be16_to_cpu(get_unaligned((__be16*)(x))))
-#define BE32(x) ((u32)be32_to_cpu(get_unaligned((__be32*)(x))))
-#define BE64(x) ((u64)be64_to_cpu(get_unaligned((__be64*)(x))))
-
/* Borrowed from msdos.c */
#define SYS_IND(p) (get_unaligned(&(p)->sys_ind))
diff --git a/fs/pipe.c b/fs/pipe.c
index ec228bc9f88..fcba6542b8d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -777,45 +777,10 @@ pipe_rdwr_open(struct inode *inode, struct file *filp)
/*
* The file_operations structs are not static because they
* are also used in linux/fs/fifo.c to do operations on FIFOs.
+ *
+ * Pipes reuse fifos' file_operations structs.
*/
-const struct file_operations read_fifo_fops = {
- .llseek = no_llseek,
- .read = do_sync_read,
- .aio_read = pipe_read,
- .write = bad_pipe_w,
- .poll = pipe_poll,
- .unlocked_ioctl = pipe_ioctl,
- .open = pipe_read_open,
- .release = pipe_read_release,
- .fasync = pipe_read_fasync,
-};
-
-const struct file_operations write_fifo_fops = {
- .llseek = no_llseek,
- .read = bad_pipe_r,
- .write = do_sync_write,
- .aio_write = pipe_write,
- .poll = pipe_poll,
- .unlocked_ioctl = pipe_ioctl,
- .open = pipe_write_open,
- .release = pipe_write_release,
- .fasync = pipe_write_fasync,
-};
-
-const struct file_operations rdwr_fifo_fops = {
- .llseek = no_llseek,
- .read = do_sync_read,
- .aio_read = pipe_read,
- .write = do_sync_write,
- .aio_write = pipe_write,
- .poll = pipe_poll,
- .unlocked_ioctl = pipe_ioctl,
- .open = pipe_rdwr_open,
- .release = pipe_rdwr_release,
- .fasync = pipe_rdwr_fasync,
-};
-
-static const struct file_operations read_pipe_fops = {
+const struct file_operations read_pipefifo_fops = {
.llseek = no_llseek,
.read = do_sync_read,
.aio_read = pipe_read,
@@ -827,7 +792,7 @@ static const struct file_operations read_pipe_fops = {
.fasync = pipe_read_fasync,
};
-static const struct file_operations write_pipe_fops = {
+const struct file_operations write_pipefifo_fops = {
.llseek = no_llseek,
.read = bad_pipe_r,
.write = do_sync_write,
@@ -839,7 +804,7 @@ static const struct file_operations write_pipe_fops = {
.fasync = pipe_write_fasync,
};
-static const struct file_operations rdwr_pipe_fops = {
+const struct file_operations rdwr_pipefifo_fops = {
.llseek = no_llseek,
.read = do_sync_read,
.aio_read = pipe_read,
@@ -927,7 +892,7 @@ static struct inode * get_pipe_inode(void)
inode->i_pipe = pipe;
pipe->readers = pipe->writers = 1;
- inode->i_fop = &rdwr_pipe_fops;
+ inode->i_fop = &rdwr_pipefifo_fops;
/*
* Mark the inode dirty from the very beginning,
@@ -950,7 +915,7 @@ fail_inode:
return NULL;
}
-struct file *create_write_pipe(void)
+struct file *create_write_pipe(int flags)
{
int err;
struct inode *inode;
@@ -978,12 +943,12 @@ struct file *create_write_pipe(void)
d_instantiate(dentry, inode);
err = -ENFILE;
- f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipe_fops);
+ f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipefifo_fops);
if (!f)
goto err_dentry;
f->f_mapping = inode->i_mapping;
- f->f_flags = O_WRONLY;
+ f->f_flags = O_WRONLY | (flags & O_NONBLOCK);
f->f_version = 0;
return f;
@@ -1003,51 +968,53 @@ struct file *create_write_pipe(void)
void free_write_pipe(struct file *f)
{
free_pipe_info(f->f_dentry->d_inode);
- dput(f->f_path.dentry);
- mntput(f->f_path.mnt);
+ path_put(&f->f_path);
put_filp(f);
}
-struct file *create_read_pipe(struct file *wrf)
+struct file *create_read_pipe(struct file *wrf, int flags)
{
struct file *f = get_empty_filp();
if (!f)
return ERR_PTR(-ENFILE);
/* Grab pipe from the writer */
- f->f_path.mnt = mntget(wrf->f_path.mnt);
- f->f_path.dentry = dget(wrf->f_path.dentry);
+ f->f_path = wrf->f_path;
+ path_get(&wrf->f_path);
f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping;
f->f_pos = 0;
- f->f_flags = O_RDONLY;
- f->f_op = &read_pipe_fops;
+ f->f_flags = O_RDONLY | (flags & O_NONBLOCK);
+ f->f_op = &read_pipefifo_fops;
f->f_mode = FMODE_READ;
f->f_version = 0;
return f;
}
-int do_pipe(int *fd)
+int do_pipe_flags(int *fd, int flags)
{
struct file *fw, *fr;
int error;
int fdw, fdr;
- fw = create_write_pipe();
+ if (flags & ~(O_CLOEXEC | O_NONBLOCK))
+ return -EINVAL;
+
+ fw = create_write_pipe(flags);
if (IS_ERR(fw))
return PTR_ERR(fw);
- fr = create_read_pipe(fw);
+ fr = create_read_pipe(fw, flags);
error = PTR_ERR(fr);
if (IS_ERR(fr))
goto err_write_pipe;
- error = get_unused_fd();
+ error = get_unused_fd_flags(flags);
if (error < 0)
goto err_read_pipe;
fdr = error;
- error = get_unused_fd();
+ error = get_unused_fd_flags(flags);
if (error < 0)
goto err_fdr;
fdw = error;
@@ -1068,24 +1035,28 @@ int do_pipe(int *fd)
err_fdr:
put_unused_fd(fdr);
err_read_pipe:
- dput(fr->f_dentry);
- mntput(fr->f_vfsmnt);
+ path_put(&fr->f_path);
put_filp(fr);
err_write_pipe:
free_write_pipe(fw);
return error;
}
+int do_pipe(int *fd)
+{
+ return do_pipe_flags(fd, 0);
+}
+
/*
* sys_pipe() is the normal C calling standard for creating
* a pipe. It's not the way Unix traditionally does this, though.
*/
-asmlinkage long __weak sys_pipe(int __user *fildes)
+asmlinkage long __weak sys_pipe2(int __user *fildes, int flags)
{
int fd[2];
int error;
- error = do_pipe(fd);
+ error = do_pipe_flags(fd, flags);
if (!error) {
if (copy_to_user(fildes, fd, sizeof(fd))) {
sys_close(fd[0]);
@@ -1096,6 +1067,11 @@ asmlinkage long __weak sys_pipe(int __user *fildes)
return error;
}
+asmlinkage long __weak sys_pipe(int __user *fildes)
+{
+ return sys_pipe2(fildes, 0);
+}
+
/*
* pipefs should _never_ be mounted by userland - too much of security hassle,
* no real gain from having the whole whorehouse mounted. So we don't need
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
new file mode 100644
index 00000000000..50f8f0600f0
--- /dev/null
+++ b/fs/proc/Kconfig
@@ -0,0 +1,69 @@
+config PROC_FS
+ bool "/proc file system support" if EMBEDDED
+ default y
+ help
+ This is a virtual file system providing information about the status
+ of the system. "Virtual" means that it doesn't take up any space on
+ your hard disk: the files are created on the fly by the kernel when
+ you try to access them. Also, you cannot read the files with older
+ version of the program less: you need to use more or cat.
+
+ It's totally cool; for example, "cat /proc/interrupts" gives
+ information about what the different IRQs are used for at the moment
+ (there is a small number of Interrupt ReQuest lines in your computer
+ that are used by the attached devices to gain the CPU's attention --
+ often a source of trouble if two devices are mistakenly configured
+ to use the same IRQ). The program procinfo to display some
+ information about your system gathered from the /proc file system.
+
+ Before you can use the /proc file system, it has to be mounted,
+ meaning it has to be given a location in the directory hierarchy.
+ That location should be /proc. A command such as "mount -t proc proc
+ /proc" or the equivalent line in /etc/fstab does the job.
+
+ The /proc file system is explained in the file
+ <file:Documentation/filesystems/proc.txt> and on the proc(5) manpage
+ ("man 5 proc").
+
+ This option will enlarge your kernel by about 67 KB. Several
+ programs depend on this, so everyone should say Y here.
+
+config PROC_KCORE
+ bool "/proc/kcore support" if !ARM
+ depends on PROC_FS && MMU
+
+config PROC_VMCORE
+ bool "/proc/vmcore support (EXPERIMENTAL)"
+ depends on PROC_FS && CRASH_DUMP
+ default y
+ help
+ Exports the dump image of crashed kernel in ELF format.
+
+config PROC_SYSCTL
+ bool "Sysctl support (/proc/sys)" if EMBEDDED
+ depends on PROC_FS
+ select SYSCTL
+ default y
+ ---help---
+ The sysctl interface provides a means of dynamically changing
+ certain kernel parameters and variables on the fly without requiring
+ a recompile of the kernel or reboot of the system. The primary
+ interface is through /proc/sys. If you say Y here a tree of
+ modifiable sysctl entries will be generated beneath the
+ /proc/sys directory. They are explained in the files
+ in <file:Documentation/sysctl/>. Note that enabling this
+ option will enlarge the kernel by at least 8 KB.
+
+ As it is generally a good thing, you should say Y here unless
+ building a kernel for install/rescue disks or your system is very
+ limited in memory.
+
+config PROC_PAGE_MONITOR
+ default y
+ depends on PROC_FS && MMU
+ bool "Enable /proc page monitoring" if EMBEDDED
+ help
+ Various /proc files exist to monitor process memory utilization:
+ /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
+ /proc/kpagecount, and /proc/kpageflags. Disabling these
+ interfaces will reduce the size of the kernel by approximately 4kb.
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 797d775e035..f4bc0e78953 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -80,16 +80,12 @@
#include <linux/delayacct.h>
#include <linux/seq_file.h>
#include <linux/pid_namespace.h>
+#include <linux/tracehook.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include "internal.h"
-/* Gcc optimizes away "strlen(x)" for constant x */
-#define ADDBUF(buffer, string) \
-do { memcpy(buffer, string, strlen(string)); \
- buffer += strlen(string); } while (0)
-
static inline void task_name(struct seq_file *m, struct task_struct *p)
{
int i;
@@ -168,8 +164,12 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
rcu_read_lock();
ppid = pid_alive(p) ?
task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
- tpid = pid_alive(p) && p->ptrace ?
- task_pid_nr_ns(rcu_dereference(p->parent), ns) : 0;
+ tpid = 0;
+ if (pid_alive(p)) {
+ struct task_struct *tracer = tracehook_tracer_task(p);
+ if (tracer)
+ tpid = task_pid_nr_ns(tracer, ns);
+ }
seq_printf(m,
"State:\t%s\n"
"Tgid:\t%d\n"
@@ -256,7 +256,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
sigemptyset(&ignored);
sigemptyset(&caught);
- rcu_read_lock();
if (lock_task_sighand(p, &flags)) {
pending = p->pending.signal;
shpending = p->signal->shared_pending.signal;
@@ -267,7 +266,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur;
unlock_task_sighand(p, &flags);
}
- rcu_read_unlock();
seq_printf(m, "Threads:\t%d\n", num_threads);
seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim);
@@ -332,65 +330,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
return 0;
}
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-static cputime_t task_utime(struct task_struct *p)
-{
- return p->utime;
-}
-
-static cputime_t task_stime(struct task_struct *p)
-{
- return p->stime;
-}
-#else
-static cputime_t task_utime(struct task_struct *p)
-{
- clock_t utime = cputime_to_clock_t(p->utime),
- total = utime + cputime_to_clock_t(p->stime);
- u64 temp;
-
- /*
- * Use CFS's precise accounting:
- */
- temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
-
- if (total) {
- temp *= utime;
- do_div(temp, total);
- }
- utime = (clock_t)temp;
-
- p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
- return p->prev_utime;
-}
-
-static cputime_t task_stime(struct task_struct *p)
-{
- clock_t stime;
-
- /*
- * Use CFS's precise accounting. (we subtract utime from
- * the total, to make sure the total observed by userspace
- * grows monotonically - apps rely on that):
- */
- stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
- cputime_to_clock_t(task_utime(p));
-
- if (stime >= 0)
- p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
-
- return p->prev_stime;
-}
-#endif
-
-static cputime_t task_gtime(struct task_struct *p)
-{
- return p->gtime;
-}
-
static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task, int whole)
{
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3b455371e7f..b5918ae8ca7 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -53,6 +53,7 @@
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/init.h>
#include <linux/capability.h>
#include <linux/file.h>
@@ -69,6 +70,7 @@
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/ptrace.h>
+#include <linux/tracehook.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/audit.h>
@@ -146,9 +148,6 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
return count;
}
-int maps_protect;
-EXPORT_SYMBOL(maps_protect);
-
static struct fs_struct *get_fs_struct(struct task_struct *task)
{
struct fs_struct *fs;
@@ -162,7 +161,6 @@ static struct fs_struct *get_fs_struct(struct task_struct *task)
static int get_nr_threads(struct task_struct *tsk)
{
- /* Must be called with the rcu_read_lock held */
unsigned long flags;
int count = 0;
@@ -231,10 +229,14 @@ static int check_mem_permission(struct task_struct *task)
* If current is actively ptrace'ing, and would also be
* permitted to freshly attach with ptrace now, permit it.
*/
- if (task->parent == current && (task->ptrace & PT_PTRACED) &&
- task_is_stopped_or_traced(task) &&
- ptrace_may_attach(task))
- return 0;
+ if (task_is_stopped_or_traced(task)) {
+ int match;
+ rcu_read_lock();
+ match = (tracehook_tracer_task(task) == current);
+ rcu_read_unlock();
+ if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
+ return 0;
+ }
/*
* Noone else is allowed.
@@ -251,7 +253,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
task_lock(task);
if (task->mm != mm)
goto out;
- if (task->mm != current->mm && __ptrace_may_attach(task) < 0)
+ if (task->mm != current->mm &&
+ __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
goto out;
task_unlock(task);
return mm;
@@ -464,14 +467,10 @@ static int proc_pid_limits(struct task_struct *task, char *buffer)
struct rlimit rlim[RLIM_NLIMITS];
- rcu_read_lock();
- if (!lock_task_sighand(task,&flags)) {
- rcu_read_unlock();
+ if (!lock_task_sighand(task, &flags))
return 0;
- }
memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
unlock_task_sighand(task, &flags);
- rcu_read_unlock();
/*
* print the file header
@@ -503,6 +502,26 @@ static int proc_pid_limits(struct task_struct *task, char *buffer)
return count;
}
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+static int proc_pid_syscall(struct task_struct *task, char *buffer)
+{
+ long nr;
+ unsigned long args[6], sp, pc;
+
+ if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
+ return sprintf(buffer, "running\n");
+
+ if (nr < 0)
+ return sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
+
+ return sprintf(buffer,
+ "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
+ nr,
+ args[0], args[1], args[2], args[3], args[4], args[5],
+ sp, pc);
+}
+#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
+
/************************************************************************/
/* Here the fs part begins */
/************************************************************************/
@@ -518,7 +537,7 @@ static int proc_fd_access_allowed(struct inode *inode)
*/
task = get_proc_task(inode);
if (task) {
- allowed = ptrace_may_attach(task);
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ);
put_task_struct(task);
}
return allowed;
@@ -904,7 +923,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
if (!task)
goto out_no_task;
- if (!ptrace_may_attach(task))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out;
ret = -ENOMEM;
@@ -1833,8 +1852,7 @@ static const struct file_operations proc_fd_operations = {
* /proc/pid/fd needs a special permission handler so that a process can still
* access /proc/self/fd after it has executed a setuid().
*/
-static int proc_fd_permission(struct inode *inode, int mask,
- struct nameidata *nd)
+static int proc_fd_permission(struct inode *inode, int mask)
{
int rv;
@@ -2375,29 +2393,54 @@ static int proc_base_fill_cache(struct file *filp, void *dirent,
}
#ifdef CONFIG_TASK_IO_ACCOUNTING
-static int proc_pid_io_accounting(struct task_struct *task, char *buffer)
+static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
{
+ struct task_io_accounting acct = task->ioac;
+ unsigned long flags;
+
+ if (whole && lock_task_sighand(task, &flags)) {
+ struct task_struct *t = task;
+
+ task_io_accounting_add(&acct, &task->signal->ioac);
+ while_each_thread(task, t)
+ task_io_accounting_add(&acct, &t->ioac);
+
+ unlock_task_sighand(task, &flags);
+ }
return sprintf(buffer,
-#ifdef CONFIG_TASK_XACCT
"rchar: %llu\n"
"wchar: %llu\n"
"syscr: %llu\n"
"syscw: %llu\n"
-#endif
"read_bytes: %llu\n"
"write_bytes: %llu\n"
"cancelled_write_bytes: %llu\n",
-#ifdef CONFIG_TASK_XACCT
- (unsigned long long)task->rchar,
- (unsigned long long)task->wchar,
- (unsigned long long)task->syscr,
- (unsigned long long)task->syscw,
-#endif
- (unsigned long long)task->ioac.read_bytes,
- (unsigned long long)task->ioac.write_bytes,
- (unsigned long long)task->ioac.cancelled_write_bytes);
+ (unsigned long long)acct.rchar,
+ (unsigned long long)acct.wchar,
+ (unsigned long long)acct.syscr,
+ (unsigned long long)acct.syscw,
+ (unsigned long long)acct.read_bytes,
+ (unsigned long long)acct.write_bytes,
+ (unsigned long long)acct.cancelled_write_bytes);
+}
+
+static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
+{
+ return do_io_accounting(task, buffer, 0);
+}
+
+static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
+{
+ return do_io_accounting(task, buffer, 1);
+}
+#endif /* CONFIG_TASK_IO_ACCOUNTING */
+
+static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ seq_printf(m, "%08x\n", task->personality);
+ return 0;
}
-#endif
/*
* Thread groups
@@ -2415,10 +2458,14 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("environ", S_IRUSR, environ),
INF("auxv", S_IRUSR, pid_auxv),
ONE("status", S_IRUGO, pid_status),
+ ONE("personality", S_IRUSR, pid_personality),
INF("limits", S_IRUSR, pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, pid_sched),
#endif
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+ INF("syscall", S_IRUSR, pid_syscall),
+#endif
INF("cmdline", S_IRUGO, pid_cmdline),
ONE("stat", S_IRUGO, tgid_stat),
ONE("statm", S_IRUGO, pid_statm),
@@ -2469,7 +2516,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
- INF("io", S_IRUGO, pid_io_accounting),
+ INF("io", S_IRUGO, tgid_io_accounting),
#endif
};
@@ -2747,10 +2794,14 @@ static const struct pid_entry tid_base_stuff[] = {
REG("environ", S_IRUSR, environ),
INF("auxv", S_IRUSR, pid_auxv),
ONE("status", S_IRUGO, pid_status),
+ ONE("personality", S_IRUSR, pid_personality),
INF("limits", S_IRUSR, pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, pid_sched),
#endif
+#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
+ INF("syscall", S_IRUSR, pid_syscall),
+#endif
INF("cmdline", S_IRUGO, pid_cmdline),
ONE("stat", S_IRUGO, tid_stat),
ONE("statm", S_IRUGO, pid_statm),
@@ -2796,6 +2847,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_FAULT_INJECTION
REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
#endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ INF("io", S_IRUGO, tid_io_accounting),
+#endif
};
static int proc_tid_base_readdir(struct file * filp,
@@ -3035,9 +3089,7 @@ static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
generic_fillattr(inode, stat);
if (p) {
- rcu_read_lock();
stat->nlink += get_nr_threads(p);
- rcu_read_unlock();
put_task_struct(p);
}
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 43e54e86cef..7821589a17d 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -300,10 +300,10 @@ out:
return rtn;
}
-static DEFINE_IDR(proc_inum_idr);
+static DEFINE_IDA(proc_inum_ida);
static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
-#define PROC_DYNAMIC_FIRST 0xF0000000UL
+#define PROC_DYNAMIC_FIRST 0xF0000000U
/*
* Return an inode number between PROC_DYNAMIC_FIRST and
@@ -311,36 +311,34 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
*/
static unsigned int get_inode_number(void)
{
- int i, inum = 0;
+ unsigned int i;
int error;
retry:
- if (idr_pre_get(&proc_inum_idr, GFP_KERNEL) == 0)
+ if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0)
return 0;
spin_lock(&proc_inum_lock);
- error = idr_get_new(&proc_inum_idr, NULL, &i);
+ error = ida_get_new(&proc_inum_ida, &i);
spin_unlock(&proc_inum_lock);
if (error == -EAGAIN)
goto retry;
else if (error)
return 0;
- inum = (i & MAX_ID_MASK) + PROC_DYNAMIC_FIRST;
-
- /* inum will never be more than 0xf0ffffff, so no check
- * for overflow.
- */
-
- return inum;
+ if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
+ spin_lock(&proc_inum_lock);
+ ida_remove(&proc_inum_ida, i);
+ spin_unlock(&proc_inum_lock);
+ return 0;
+ }
+ return PROC_DYNAMIC_FIRST + i;
}
static void release_inode_number(unsigned int inum)
{
- int id = (inum - PROC_DYNAMIC_FIRST) | ~MAX_ID_MASK;
-
spin_lock(&proc_inum_lock);
- idr_remove(&proc_inum_idr, id);
+ ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
spin_unlock(&proc_inum_lock);
}
@@ -549,8 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
for (tmp = dir->subdir; tmp; tmp = tmp->next)
if (strcmp(tmp->name, dp->name) == 0) {
- printk(KERN_WARNING "proc_dir_entry '%s' already "
- "registered\n", dp->name);
+ printk(KERN_WARNING "proc_dir_entry '%s/%s' already registered\n",
+ dir->name, dp->name);
dump_stack();
break;
}
@@ -597,6 +595,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
ent->pde_users = 0;
spin_lock_init(&ent->pde_unload_lock);
ent->pde_unload_completion = NULL;
+ INIT_LIST_HEAD(&ent->pde_openers);
out:
return ent;
}
@@ -789,15 +788,25 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
spin_unlock(&de->pde_unload_lock);
continue_removing:
+ spin_lock(&de->pde_unload_lock);
+ while (!list_empty(&de->pde_openers)) {
+ struct pde_opener *pdeo;
+
+ pdeo = list_first_entry(&de->pde_openers, struct pde_opener, lh);
+ list_del(&pdeo->lh);
+ spin_unlock(&de->pde_unload_lock);
+ pdeo->release(pdeo->inode, pdeo->file);
+ kfree(pdeo);
+ spin_lock(&de->pde_unload_lock);
+ }
+ spin_unlock(&de->pde_unload_lock);
+
if (S_ISDIR(de->mode))
parent->nlink--;
de->nlink = 0;
- if (de->subdir) {
- printk(KERN_WARNING "%s: removing non-empty directory "
+ WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory "
"'%s/%s', leaking at least '%s'\n", __func__,
de->parent->name, de->name, de->subdir->name);
- WARN_ON(1);
- }
if (atomic_dec_and_test(&de->count))
free_proc_entry(de);
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index b08d1001791..c6b4fa7e3b4 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -17,6 +17,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/smp_lock.h>
+#include <linux/sysctl.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -65,6 +66,8 @@ static void proc_delete_inode(struct inode *inode)
module_put(de->owner);
de_put(de);
}
+ if (PROC_I(inode)->sysctl)
+ sysctl_head_put(PROC_I(inode)->sysctl);
clear_inode(inode);
}
@@ -84,6 +87,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei->fd = 0;
ei->op.proc_get_link = NULL;
ei->pde = NULL;
+ ei->sysctl = NULL;
+ ei->sysctl_entry = NULL;
inode = &ei->vfs_inode;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
return inode;
@@ -94,7 +99,7 @@ static void proc_destroy_inode(struct inode *inode)
kmem_cache_free(proc_inode_cachep, PROC_I(inode));
}
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
{
struct proc_inode *ei = (struct proc_inode *) foo;
@@ -111,27 +116,25 @@ int __init proc_init_inodecache(void)
return 0;
}
-static int proc_remount(struct super_block *sb, int *flags, char *data)
-{
- *flags |= MS_NODIRATIME;
- return 0;
-}
-
static const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
.destroy_inode = proc_destroy_inode,
.drop_inode = generic_delete_inode,
.delete_inode = proc_delete_inode,
.statfs = simple_statfs,
- .remount_fs = proc_remount,
};
-static void pde_users_dec(struct proc_dir_entry *pde)
+static void __pde_users_dec(struct proc_dir_entry *pde)
{
- spin_lock(&pde->pde_unload_lock);
pde->pde_users--;
if (pde->pde_unload_completion && pde->pde_users == 0)
complete(pde->pde_unload_completion);
+}
+
+static void pde_users_dec(struct proc_dir_entry *pde)
+{
+ spin_lock(&pde->pde_unload_lock);
+ __pde_users_dec(pde);
spin_unlock(&pde->pde_unload_lock);
}
@@ -318,36 +321,97 @@ static int proc_reg_open(struct inode *inode, struct file *file)
struct proc_dir_entry *pde = PDE(inode);
int rv = 0;
int (*open)(struct inode *, struct file *);
+ int (*release)(struct inode *, struct file *);
+ struct pde_opener *pdeo;
+
+ /*
+ * What for, you ask? Well, we can have open, rmmod, remove_proc_entry
+ * sequence. ->release won't be called because ->proc_fops will be
+ * cleared. Depending on complexity of ->release, consequences vary.
+ *
+ * We can't wait for mercy when close will be done for real, it's
+ * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release
+ * by hand in remove_proc_entry(). For this, save opener's credentials
+ * for later.
+ */
+ pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL);
+ if (!pdeo)
+ return -ENOMEM;
spin_lock(&pde->pde_unload_lock);
if (!pde->proc_fops) {
spin_unlock(&pde->pde_unload_lock);
- return rv;
+ kfree(pdeo);
+ return -EINVAL;
}
pde->pde_users++;
open = pde->proc_fops->open;
+ release = pde->proc_fops->release;
spin_unlock(&pde->pde_unload_lock);
if (open)
rv = open(inode, file);
- pde_users_dec(pde);
+ spin_lock(&pde->pde_unload_lock);
+ if (rv == 0 && release) {
+ /* To know what to release. */
+ pdeo->inode = inode;
+ pdeo->file = file;
+ /* Strictly for "too late" ->release in proc_reg_release(). */
+ pdeo->release = release;
+ list_add(&pdeo->lh, &pde->pde_openers);
+ } else
+ kfree(pdeo);
+ __pde_users_dec(pde);
+ spin_unlock(&pde->pde_unload_lock);
return rv;
}
+static struct pde_opener *find_pde_opener(struct proc_dir_entry *pde,
+ struct inode *inode, struct file *file)
+{
+ struct pde_opener *pdeo;
+
+ list_for_each_entry(pdeo, &pde->pde_openers, lh) {
+ if (pdeo->inode == inode && pdeo->file == file)
+ return pdeo;
+ }
+ return NULL;
+}
+
static int proc_reg_release(struct inode *inode, struct file *file)
{
struct proc_dir_entry *pde = PDE(inode);
int rv = 0;
int (*release)(struct inode *, struct file *);
+ struct pde_opener *pdeo;
spin_lock(&pde->pde_unload_lock);
+ pdeo = find_pde_opener(pde, inode, file);
if (!pde->proc_fops) {
- spin_unlock(&pde->pde_unload_lock);
+ /*
+ * Can't simply exit, __fput() will think that everything is OK,
+ * and move on to freeing struct file. remove_proc_entry() will
+ * find slacker in opener's list and will try to do non-trivial
+ * things with struct file. Therefore, remove opener from list.
+ *
+ * But if opener is removed from list, who will ->release it?
+ */
+ if (pdeo) {
+ list_del(&pdeo->lh);
+ spin_unlock(&pde->pde_unload_lock);
+ rv = pdeo->release(inode, file);
+ kfree(pdeo);
+ } else
+ spin_unlock(&pde->pde_unload_lock);
return rv;
}
pde->pde_users++;
release = pde->proc_fops->release;
+ if (pdeo) {
+ list_del(&pdeo->lh);
+ kfree(pdeo);
+ }
spin_unlock(&pde->pde_unload_lock);
if (release)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 28cbca80590..3bfb7b8747b 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -45,8 +45,6 @@ do { \
extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *);
#endif
-extern int maps_protect;
-
extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task);
extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns,
@@ -63,6 +61,7 @@ extern const struct file_operations proc_smaps_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
extern const struct file_operations proc_net_operations;
+extern const struct file_operations proc_kmsg_operations;
extern const struct inode_operations proc_net_inode_operations;
void free_proc_entry(struct proc_dir_entry *de);
@@ -88,3 +87,10 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
struct dentry *dentry);
int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
filldir_t filldir);
+
+struct pde_opener {
+ struct inode *inode;
+ struct file *file;
+ int (*release)(struct inode *, struct file *);
+ struct list_head lh;
+};
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e78c81fcf54..c2370c76fb7 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -23,6 +23,10 @@
#define CORE_STR "CORE"
+#ifndef ELF_CORE_EFLAGS
+#define ELF_CORE_EFLAGS 0
+#endif
+
static int open_kcore(struct inode * inode, struct file * filp)
{
return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
@@ -164,11 +168,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
elf->e_entry = 0;
elf->e_phoff = sizeof(struct elfhdr);
elf->e_shoff = 0;
-#if defined(CONFIG_H8300)
- elf->e_flags = ELF_FLAGS;
-#else
- elf->e_flags = 0;
-#endif
+ elf->e_flags = ELF_CORE_EFLAGS;
elf->e_ehsize = sizeof(struct elfhdr);
elf->e_phentsize= sizeof(struct elf_phdr);
elf->e_phnum = nphdr;
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index ff3b90b56e9..9fd5df3f40c 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -15,6 +15,8 @@
#include <asm/uaccess.h>
#include <asm/io.h>
+#include "internal.h"
+
extern wait_queue_head_t log_wait;
extern int do_syslog(int type, char __user *bug, int count);
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 79ecd281d2c..3f87d263294 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -52,14 +52,14 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
}
seq_printf(m,
- "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
+ "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
vma->vm_start,
vma->vm_end,
flags & VM_READ ? 'r' : '-',
flags & VM_WRITE ? 'w' : '-',
flags & VM_EXEC ? 'x' : '-',
flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
- vma->vm_pgoff << PAGE_SHIFT,
+ ((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
MAJOR(dev), MINOR(dev), ino, &len);
if (file) {
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 7e277f2ad46..b675a49c182 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -24,6 +24,7 @@
#include <linux/tty.h>
#include <linux/string.h>
#include <linux/mman.h>
+#include <linux/quicklist.h>
#include <linux/proc_fs.h>
#include <linux/ioport.h>
#include <linux/mm.h>
@@ -67,7 +68,6 @@
extern int get_hardware_list(char *);
extern int get_stram_list(char *);
extern int get_exec_domain_list(char *);
-extern int get_dma_list(char *);
static int proc_calc_metrics(char *page, char **start, off_t off,
int count, int *eof, int len)
@@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off,
return proc_calc_metrics(page, start, off, count, eof, len);
}
+int __attribute__((weak)) arch_report_meminfo(char *page)
+{
+ return 0;
+}
+
static int meminfo_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
@@ -177,6 +182,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
"SReclaimable: %8lu kB\n"
"SUnreclaim: %8lu kB\n"
"PageTables: %8lu kB\n"
+#ifdef CONFIG_QUICKLIST
+ "Quicklists: %8lu kB\n"
+#endif
"NFS_Unstable: %8lu kB\n"
"Bounce: %8lu kB\n"
"WritebackTmp: %8lu kB\n"
@@ -209,6 +217,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
K(global_page_state(NR_SLAB_RECLAIMABLE)),
K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
K(global_page_state(NR_PAGETABLE)),
+#ifdef CONFIG_QUICKLIST
+ K(quicklist_total_size()),
+#endif
K(global_page_state(NR_UNSTABLE_NFS)),
K(global_page_state(NR_BOUNCE)),
K(global_page_state(NR_WRITEBACK_TEMP)),
@@ -221,11 +232,12 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
len += hugetlb_report_meminfo(page + len);
+ len += arch_report_meminfo(page + len);
+
return proc_calc_metrics(page, start, off, count, eof, len);
#undef K
}
-extern const struct seq_operations fragmentation_op;
static int fragmentation_open(struct inode *inode, struct file *file)
{
(void)inode;
@@ -239,7 +251,6 @@ static const struct file_operations fragmentation_file_operations = {
.release = seq_release,
};
-extern const struct seq_operations pagetypeinfo_op;
static int pagetypeinfo_open(struct inode *inode, struct file *file)
{
return seq_open(file, &pagetypeinfo_op);
@@ -252,7 +263,6 @@ static const struct file_operations pagetypeinfo_file_ops = {
.release = seq_release,
};
-extern const struct seq_operations zoneinfo_op;
static int zoneinfo_open(struct inode *inode, struct file *file)
{
return seq_open(file, &zoneinfo_op);
@@ -349,7 +359,6 @@ static const struct file_operations proc_devinfo_operations = {
.release = seq_release,
};
-extern const struct seq_operations vmstat_op;
static int vmstat_open(struct inode *inode, struct file *file)
{
return seq_open(file, &vmstat_op);
@@ -461,17 +470,35 @@ static const struct file_operations proc_slabstats_operations = {
#ifdef CONFIG_MMU
static int vmalloc_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &vmalloc_op);
+ unsigned int *ptr = NULL;
+ int ret;
+
+ if (NUMA_BUILD)
+ ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
+ ret = seq_open(file, &vmalloc_op);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = ptr;
+ } else
+ kfree(ptr);
+ return ret;
}
static const struct file_operations proc_vmalloc_operations = {
.open = vmalloc_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_private,
};
#endif
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+#ifndef arch_irq_stat
+#define arch_irq_stat() 0
+#endif
+
static int show_stat(struct seq_file *p, void *v)
{
int i;
@@ -509,7 +536,9 @@ static int show_stat(struct seq_file *p, void *v)
sum += temp;
per_irq_sum[j] += temp;
}
+ sum += arch_irq_stat_cpu(i);
}
+ sum += arch_irq_stat();
seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
(unsigned long long)cputime64_to_clock_t(user),
@@ -654,6 +683,7 @@ static int cmdline_read_proc(char *page, char **start, off_t off,
return proc_calc_metrics(page, start, off, count, eof, len);
}
+#ifdef CONFIG_FILE_LOCKING
static int locks_open(struct inode *inode, struct file *filp)
{
return seq_open(filp, &locks_seq_operations);
@@ -665,6 +695,7 @@ static const struct file_operations proc_locks_operations = {
.llseek = seq_lseek,
.release = seq_release,
};
+#endif /* CONFIG_FILE_LOCKING */
static int execdomains_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
@@ -858,7 +889,9 @@ void __init proc_misc_init(void)
#ifdef CONFIG_PRINTK
proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
#endif
+#ifdef CONFIG_FILE_LOCKING
proc_create("locks", 0, NULL, &proc_locks_operations);
+#endif
proc_create("devices", 0, NULL, &proc_devinfo_operations);
proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
#ifdef CONFIG_BLOCK
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 83f357b30d7..7bc296f424a 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -27,6 +27,11 @@
#include "internal.h"
+static struct net *get_proc_net(const struct inode *inode)
+{
+ return maybe_get_net(PDE_NET(PDE(inode)));
+}
+
int seq_open_net(struct inode *ino, struct file *f,
const struct seq_operations *ops, int size)
{
@@ -51,6 +56,30 @@ int seq_open_net(struct inode *ino, struct file *f,
}
EXPORT_SYMBOL_GPL(seq_open_net);
+int single_open_net(struct inode *inode, struct file *file,
+ int (*show)(struct seq_file *, void *))
+{
+ int err;
+ struct net *net;
+
+ err = -ENXIO;
+ net = get_proc_net(inode);
+ if (net == NULL)
+ goto err_net;
+
+ err = single_open(file, show, net);
+ if (err < 0)
+ goto err_open;
+
+ return 0;
+
+err_open:
+ put_net(net);
+err_net:
+ return err;
+}
+EXPORT_SYMBOL_GPL(single_open_net);
+
int seq_release_net(struct inode *ino, struct file *f)
{
struct seq_file *seq;
@@ -63,6 +92,14 @@ int seq_release_net(struct inode *ino, struct file *f)
}
EXPORT_SYMBOL_GPL(seq_release_net);
+int single_release_net(struct inode *ino, struct file *f)
+{
+ struct seq_file *seq = f->private_data;
+ put_net(seq->private);
+ return single_release(ino, f);
+}
+EXPORT_SYMBOL_GPL(single_release_net);
+
static struct net *get_proc_task_net(struct inode *dir)
{
struct task_struct *task;
@@ -153,12 +190,6 @@ void proc_net_remove(struct net *net, const char *name)
}
EXPORT_SYMBOL_GPL(proc_net_remove);
-struct net *get_proc_net(const struct inode *inode)
-{
- return maybe_get_net(PDE_NET(PDE(inode)));
-}
-EXPORT_SYMBOL_GPL(get_proc_net);
-
static __net_init int proc_net_ns_init(struct net *net)
{
struct proc_dir_entry *netd, *net_statd;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5acc001d49f..945a81043ba 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -10,149 +10,110 @@
static struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
static const struct inode_operations proc_sys_inode_operations;
+static const struct file_operations proc_sys_dir_file_operations;
+static const struct inode_operations proc_sys_dir_operations;
-static void proc_sys_refresh_inode(struct inode *inode, struct ctl_table *table)
-{
- /* Refresh the cached information bits in the inode */
- if (table) {
- inode->i_uid = 0;
- inode->i_gid = 0;
- inode->i_mode = table->mode;
- if (table->proc_handler) {
- inode->i_mode |= S_IFREG;
- inode->i_nlink = 1;
- } else {
- inode->i_mode |= S_IFDIR;
- inode->i_nlink = 0; /* It is too hard to figure out */
- }
- }
-}
-
-static struct inode *proc_sys_make_inode(struct inode *dir, struct ctl_table *table)
+static struct inode *proc_sys_make_inode(struct super_block *sb,
+ struct ctl_table_header *head, struct ctl_table *table)
{
struct inode *inode;
- struct proc_inode *dir_ei, *ei;
- int depth;
+ struct proc_inode *ei;
- inode = new_inode(dir->i_sb);
+ inode = new_inode(sb);
if (!inode)
goto out;
- /* A directory is always one deeper than it's parent */
- dir_ei = PROC_I(dir);
- depth = dir_ei->fd + 1;
-
+ sysctl_head_get(head);
ei = PROC_I(inode);
- ei->fd = depth;
+ ei->sysctl = head;
+ ei->sysctl_entry = table;
+
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
- inode->i_op = &proc_sys_inode_operations;
- inode->i_fop = &proc_sys_file_operations;
inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
- proc_sys_refresh_inode(inode, table);
+ inode->i_mode = table->mode;
+ if (!table->child) {
+ inode->i_mode |= S_IFREG;
+ inode->i_op = &proc_sys_inode_operations;
+ inode->i_fop = &proc_sys_file_operations;
+ } else {
+ inode->i_mode |= S_IFDIR;
+ inode->i_nlink = 0;
+ inode->i_op = &proc_sys_dir_operations;
+ inode->i_fop = &proc_sys_dir_file_operations;
+ }
out:
return inode;
}
-static struct dentry *proc_sys_ancestor(struct dentry *dentry, int depth)
-{
- for (;;) {
- struct proc_inode *ei;
-
- ei = PROC_I(dentry->d_inode);
- if (ei->fd == depth)
- break; /* found */
-
- dentry = dentry->d_parent;
- }
- return dentry;
-}
-
-static struct ctl_table *proc_sys_lookup_table_one(struct ctl_table *table,
- struct qstr *name)
+static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
{
int len;
- for ( ; table->ctl_name || table->procname; table++) {
+ for ( ; p->ctl_name || p->procname; p++) {
- if (!table->procname)
+ if (!p->procname)
continue;
- len = strlen(table->procname);
+ len = strlen(p->procname);
if (len != name->len)
continue;
- if (memcmp(table->procname, name->name, len) != 0)
+ if (memcmp(p->procname, name->name, len) != 0)
continue;
/* I have a match */
- return table;
+ return p;
}
return NULL;
}
-static struct ctl_table *proc_sys_lookup_table(struct dentry *dentry,
- struct ctl_table *table)
+static struct ctl_table_header *grab_header(struct inode *inode)
{
- struct dentry *ancestor;
- struct proc_inode *ei;
- int depth, i;
+ if (PROC_I(inode)->sysctl)
+ return sysctl_head_grab(PROC_I(inode)->sysctl);
+ else
+ return sysctl_head_next(NULL);
+}
- ei = PROC_I(dentry->d_inode);
- depth = ei->fd;
+static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct ctl_table_header *head = grab_header(dir);
+ struct ctl_table *table = PROC_I(dir)->sysctl_entry;
+ struct ctl_table_header *h = NULL;
+ struct qstr *name = &dentry->d_name;
+ struct ctl_table *p;
+ struct inode *inode;
+ struct dentry *err = ERR_PTR(-ENOENT);
- if (depth == 0)
- return table;
+ if (IS_ERR(head))
+ return ERR_CAST(head);
- for (i = 1; table && (i <= depth); i++) {
- ancestor = proc_sys_ancestor(dentry, i);
- table = proc_sys_lookup_table_one(table, &ancestor->d_name);
- if (table)
- table = table->child;
+ if (table && !table->child) {
+ WARN_ON(1);
+ goto out;
}
- return table;
-}
-static struct ctl_table *proc_sys_lookup_entry(struct dentry *dparent,
- struct qstr *name,
- struct ctl_table *table)
-{
- table = proc_sys_lookup_table(dparent, table);
- if (table)
- table = proc_sys_lookup_table_one(table, name);
- return table;
-}
+ table = table ? table->child : head->ctl_table;
-static struct ctl_table *do_proc_sys_lookup(struct dentry *parent,
- struct qstr *name,
- struct ctl_table_header **ptr)
-{
- struct ctl_table_header *head;
- struct ctl_table *table = NULL;
-
- for (head = sysctl_head_next(NULL); head;
- head = sysctl_head_next(head)) {
- table = proc_sys_lookup_entry(parent, name, head->ctl_table);
- if (table)
- break;
+ p = find_in_table(table, name);
+ if (!p) {
+ for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
+ if (h->attached_to != table)
+ continue;
+ p = find_in_table(h->attached_by, name);
+ if (p)
+ break;
+ }
}
- *ptr = head;
- return table;
-}
-
-static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
- struct nameidata *nd)
-{
- struct ctl_table_header *head;
- struct inode *inode;
- struct dentry *err;
- struct ctl_table *table;
- err = ERR_PTR(-ENOENT);
- table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head);
- if (!table)
+ if (!p)
goto out;
err = ERR_PTR(-ENOMEM);
- inode = proc_sys_make_inode(dir, table);
+ inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
+ if (h)
+ sysctl_head_finish(h);
+
if (!inode)
goto out;
@@ -168,22 +129,14 @@ out:
static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
size_t count, loff_t *ppos, int write)
{
- struct dentry *dentry = filp->f_dentry;
- struct ctl_table_header *head;
- struct ctl_table *table;
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct ctl_table_header *head = grab_header(inode);
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
ssize_t error;
size_t res;
- table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head);
- /* Has the sysctl entry disappeared on us? */
- error = -ENOENT;
- if (!table)
- goto out;
-
- /* Has the sysctl entry been replaced by a directory? */
- error = -EISDIR;
- if (!table->proc_handler)
- goto out;
+ if (IS_ERR(head))
+ return PTR_ERR(head);
/*
* At this point we know that the sysctl was not unregistered
@@ -193,6 +146,11 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ))
goto out;
+ /* if that can happen at all, it should be -EINVAL, not -EISDIR */
+ error = -EINVAL;
+ if (!table->proc_handler)
+ goto out;
+
/* careful: calling conventions are nasty here */
res = count;
error = table->proc_handler(table, write, filp, buf, &res, ppos);
@@ -218,82 +176,86 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
static int proc_sys_fill_cache(struct file *filp, void *dirent,
- filldir_t filldir, struct ctl_table *table)
+ filldir_t filldir,
+ struct ctl_table_header *head,
+ struct ctl_table *table)
{
- struct ctl_table_header *head;
- struct ctl_table *child_table = NULL;
struct dentry *child, *dir = filp->f_path.dentry;
struct inode *inode;
struct qstr qname;
ino_t ino = 0;
unsigned type = DT_UNKNOWN;
- int ret;
qname.name = table->procname;
qname.len = strlen(table->procname);
qname.hash = full_name_hash(qname.name, qname.len);
- /* Suppress duplicates.
- * Only fill a directory entry if it is the value that
- * an ordinary lookup of that name returns. Hide all
- * others.
- *
- * If we ever cache this translation in the dcache
- * I should do a dcache lookup first. But for now
- * it is just simpler not to.
- */
- ret = 0;
- child_table = do_proc_sys_lookup(dir, &qname, &head);
- sysctl_head_finish(head);
- if (child_table != table)
- return 0;
-
child = d_lookup(dir, &qname);
if (!child) {
- struct dentry *new;
- new = d_alloc(dir, &qname);
- if (new) {
- inode = proc_sys_make_inode(dir->d_inode, table);
- if (!inode)
- child = ERR_PTR(-ENOMEM);
- else {
- new->d_op = &proc_sys_dentry_operations;
- d_add(new, inode);
+ child = d_alloc(dir, &qname);
+ if (child) {
+ inode = proc_sys_make_inode(dir->d_sb, head, table);
+ if (!inode) {
+ dput(child);
+ return -ENOMEM;
+ } else {
+ child->d_op = &proc_sys_dentry_operations;
+ d_add(child, inode);
}
- if (child)
- dput(new);
- else
- child = new;
+ } else {
+ return -ENOMEM;
}
}
- if (!child || IS_ERR(child) || !child->d_inode)
- goto end_instantiate;
inode = child->d_inode;
- if (inode) {
- ino = inode->i_ino;
- type = inode->i_mode >> 12;
- }
+ ino = inode->i_ino;
+ type = inode->i_mode >> 12;
dput(child);
-end_instantiate:
- if (!ino)
- ino= find_inode_number(dir, &qname);
- if (!ino)
- ino = 1;
- return filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
+ return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
+}
+
+static int scan(struct ctl_table_header *head, ctl_table *table,
+ unsigned long *pos, struct file *file,
+ void *dirent, filldir_t filldir)
+{
+
+ for (; table->ctl_name || table->procname; table++, (*pos)++) {
+ int res;
+
+ /* Can't do anything without a proc name */
+ if (!table->procname)
+ continue;
+
+ if (*pos < file->f_pos)
+ continue;
+
+ res = proc_sys_fill_cache(file, dirent, filldir, head, table);
+ if (res)
+ return res;
+
+ file->f_pos = *pos + 1;
+ }
+ return 0;
}
static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
- struct dentry *dentry = filp->f_dentry;
+ struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
- struct ctl_table_header *head = NULL;
- struct ctl_table *table;
+ struct ctl_table_header *head = grab_header(inode);
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+ struct ctl_table_header *h = NULL;
unsigned long pos;
- int ret;
+ int ret = -EINVAL;
- ret = -ENOTDIR;
- if (!S_ISDIR(inode->i_mode))
+ if (IS_ERR(head))
+ return PTR_ERR(head);
+
+ if (table && !table->child) {
+ WARN_ON(1);
goto out;
+ }
+
+ table = table ? table->child : head->ctl_table;
ret = 0;
/* Avoid a switch here: arm builds fail with missing __cmpdi2 */
@@ -311,30 +273,17 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
}
pos = 2;
- /* - Find each instance of the directory
- * - Read all entries in each instance
- * - Before returning an entry to user space lookup the entry
- * by name and if I find a different entry don't return
- * this one because it means it is a buried dup.
- * For sysctl this should only happen for directory entries.
- */
- for (head = sysctl_head_next(NULL); head; head = sysctl_head_next(head)) {
- table = proc_sys_lookup_table(dentry, head->ctl_table);
+ ret = scan(head, table, &pos, filp, dirent, filldir);
+ if (ret)
+ goto out;
- if (!table)
+ for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
+ if (h->attached_to != table)
continue;
-
- for (; table->ctl_name || table->procname; table++, pos++) {
- /* Can't do anything without a proc name */
- if (!table->procname)
- continue;
-
- if (pos < filp->f_pos)
- continue;
-
- if (proc_sys_fill_cache(filp, dirent, filldir, table) < 0)
- goto out;
- filp->f_pos = pos + 1;
+ ret = scan(h, h->attached_by, &pos, filp, dirent, filldir);
+ if (ret) {
+ sysctl_head_finish(h);
+ break;
}
}
ret = 1;
@@ -343,53 +292,24 @@ out:
return ret;
}
-static int proc_sys_permission(struct inode *inode, int mask, struct nameidata *nd)
+static int proc_sys_permission(struct inode *inode, int mask)
{
/*
* sysctl entries that are not writeable,
* are _NOT_ writeable, capabilities or not.
*/
- struct ctl_table_header *head;
- struct ctl_table *table;
- struct dentry *dentry;
- int mode;
- int depth;
+ struct ctl_table_header *head = grab_header(inode);
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
int error;
- head = NULL;
- depth = PROC_I(inode)->fd;
-
- /* First check the cached permissions, in case we don't have
- * enough information to lookup the sysctl table entry.
- */
- error = -EACCES;
- mode = inode->i_mode;
-
- if (current->euid == 0)
- mode >>= 6;
- else if (in_group_p(0))
- mode >>= 3;
-
- if ((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask)
- error = 0;
-
- /* If we can't get a sysctl table entry the permission
- * checks on the cached mode will have to be enough.
- */
- if (!nd || !depth)
- goto out;
-
- dentry = nd->path.dentry;
- table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head);
+ if (IS_ERR(head))
+ return PTR_ERR(head);
- /* If the entry does not exist deny permission */
- error = -EACCES;
- if (!table)
- goto out;
+ if (!table) /* global root - r-xr-xr-x */
+ error = mask & MAY_WRITE ? -EACCES : 0;
+ else /* Use the permissions on the sysctl table entry */
+ error = sysctl_perm(head->root, table, mask);
- /* Use the permissions on the sysctl table entry */
- error = sysctl_perm(head->root, table, mask);
-out:
sysctl_head_finish(head);
return error;
}
@@ -409,42 +329,79 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
return error;
}
-/* I'm lazy and don't distinguish between files and directories,
- * until access time.
- */
+static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ctl_table_header *head = grab_header(inode);
+ struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+
+ if (IS_ERR(head))
+ return PTR_ERR(head);
+
+ generic_fillattr(inode, stat);
+ if (table)
+ stat->mode = (stat->mode & S_IFMT) | table->mode;
+
+ sysctl_head_finish(head);
+ return 0;
+}
+
static const struct file_operations proc_sys_file_operations = {
.read = proc_sys_read,
.write = proc_sys_write,
+};
+
+static const struct file_operations proc_sys_dir_file_operations = {
.readdir = proc_sys_readdir,
};
static const struct inode_operations proc_sys_inode_operations = {
+ .permission = proc_sys_permission,
+ .setattr = proc_sys_setattr,
+ .getattr = proc_sys_getattr,
+};
+
+static const struct inode_operations proc_sys_dir_operations = {
.lookup = proc_sys_lookup,
.permission = proc_sys_permission,
.setattr = proc_sys_setattr,
+ .getattr = proc_sys_getattr,
};
static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd)
{
- struct ctl_table_header *head;
- struct ctl_table *table;
- table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head);
- proc_sys_refresh_inode(dentry->d_inode, table);
- sysctl_head_finish(head);
- return !!table;
+ return !PROC_I(dentry->d_inode)->sysctl->unregistering;
+}
+
+static int proc_sys_delete(struct dentry *dentry)
+{
+ return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
+}
+
+static int proc_sys_compare(struct dentry *dir, struct qstr *qstr,
+ struct qstr *name)
+{
+ struct dentry *dentry = container_of(qstr, struct dentry, d_name);
+ if (qstr->len != name->len)
+ return 1;
+ if (memcmp(qstr->name, name->name, name->len))
+ return 1;
+ return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl);
}
static struct dentry_operations proc_sys_dentry_operations = {
.d_revalidate = proc_sys_revalidate,
+ .d_delete = proc_sys_delete,
+ .d_compare = proc_sys_compare,
};
-static struct proc_dir_entry *proc_sys_root;
-
int proc_sys_init(void)
{
+ struct proc_dir_entry *proc_sys_root;
+
proc_sys_root = proc_mkdir("sys", NULL);
- proc_sys_root->proc_iops = &proc_sys_inode_operations;
- proc_sys_root->proc_fops = &proc_sys_file_operations;
+ proc_sys_root->proc_iops = &proc_sys_dir_operations;
+ proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
proc_sys_root->nlink = 0;
return 0;
}
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 21f490f5d65..d153946d6d1 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -136,54 +136,6 @@ static const struct file_operations proc_tty_drivers_operations = {
.release = seq_release,
};
-static void * tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos)
-{
- return (*pos < NR_LDISCS) ? pos : NULL;
-}
-
-static void * tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos)
-{
- (*pos)++;
- return (*pos < NR_LDISCS) ? pos : NULL;
-}
-
-static void tty_ldiscs_seq_stop(struct seq_file *m, void *v)
-{
-}
-
-static int tty_ldiscs_seq_show(struct seq_file *m, void *v)
-{
- int i = *(loff_t *)v;
- struct tty_ldisc *ld;
-
- ld = tty_ldisc_get(i);
- if (ld == NULL)
- return 0;
- seq_printf(m, "%-10s %2d\n", ld->name ? ld->name : "???", i);
- tty_ldisc_put(i);
- return 0;
-}
-
-static const struct seq_operations tty_ldiscs_seq_ops = {
- .start = tty_ldiscs_seq_start,
- .next = tty_ldiscs_seq_next,
- .stop = tty_ldiscs_seq_stop,
- .show = tty_ldiscs_seq_show,
-};
-
-static int proc_tty_ldiscs_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &tty_ldiscs_seq_ops);
-}
-
-static const struct file_operations tty_ldiscs_proc_fops = {
- .owner = THIS_MODULE,
- .open = proc_tty_ldiscs_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
/*
* This function is called by tty_register_driver() to handle
* registering the driver's /proc handler into /proc/tty/driver/<foo>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ab8ccc9d14f..4806830ea2a 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,23 +210,20 @@ static int show_map(struct seq_file *m, void *v)
dev_t dev = 0;
int len;
- if (maps_protect && !ptrace_may_attach(task))
- return -EACCES;
-
if (file) {
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
}
- seq_printf(m, "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
+ seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
vma->vm_start,
vma->vm_end,
flags & VM_READ ? 'r' : '-',
flags & VM_WRITE ? 'w' : '-',
flags & VM_EXEC ? 'x' : '-',
flags & VM_MAYSHARE ? 's' : 'p',
- vma->vm_pgoff << PAGE_SHIFT,
+ ((loff_t)vma->vm_pgoff) << PAGE_SHIFT,
MAJOR(dev), MINOR(dev), ino, &len);
/*
@@ -476,10 +473,10 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
- static struct mm_walk clear_refs_walk;
- memset(&clear_refs_walk, 0, sizeof(clear_refs_walk));
- clear_refs_walk.pmd_entry = clear_refs_pte_range;
- clear_refs_walk.mm = mm;
+ struct mm_walk clear_refs_walk = {
+ .pmd_entry = clear_refs_pte_range,
+ .mm = mm,
+ };
down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
clear_refs_walk.private = vma;
@@ -602,11 +599,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return err;
}
-static struct mm_walk pagemap_walk = {
- .pmd_entry = pagemap_pte_range,
- .pte_hole = pagemap_pte_hole
-};
-
/*
* /proc/pid/pagemap - an array mapping virtual pages to pfns
*
@@ -641,12 +633,17 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
struct pagemapread pm;
int pagecount;
int ret = -ESRCH;
+ struct mm_walk pagemap_walk = {};
+ unsigned long src;
+ unsigned long svpfn;
+ unsigned long start_vaddr;
+ unsigned long end_vaddr;
if (!task)
goto out;
ret = -EACCES;
- if (!ptrace_may_attach(task))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out_task;
ret = -EINVAL;
@@ -659,11 +656,15 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!mm)
goto out_task;
- ret = -ENOMEM;
+
uaddr = (unsigned long)buf & PAGE_MASK;
uend = (unsigned long)(buf + count);
pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
- pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL);
+ ret = 0;
+ if (pagecount == 0)
+ goto out_mm;
+ pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+ ret = -ENOMEM;
if (!pages)
goto out_mm;
@@ -684,33 +685,33 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
pm.out = (u64 *)buf;
pm.end = (u64 *)(buf + count);
- if (!ptrace_may_attach(task)) {
- ret = -EIO;
- } else {
- unsigned long src = *ppos;
- unsigned long svpfn = src / PM_ENTRY_BYTES;
- unsigned long start_vaddr = svpfn << PAGE_SHIFT;
- unsigned long end_vaddr = TASK_SIZE_OF(task);
-
- /* watch out for wraparound */
- if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
- start_vaddr = end_vaddr;
-
- /*
- * The odds are that this will stop walking way
- * before end_vaddr, because the length of the
- * user buffer is tracked in "pm", and the walk
- * will stop when we hit the end of the buffer.
- */
- ret = walk_page_range(start_vaddr, end_vaddr,
- &pagemap_walk);
- if (ret == PM_END_OF_BUFFER)
- ret = 0;
- /* don't need mmap_sem for these, but this looks cleaner */
- *ppos += (char *)pm.out - buf;
- if (!ret)
- ret = (char *)pm.out - buf;
- }
+ pagemap_walk.pmd_entry = pagemap_pte_range;
+ pagemap_walk.pte_hole = pagemap_pte_hole;
+ pagemap_walk.mm = mm;
+ pagemap_walk.private = &pm;
+
+ src = *ppos;
+ svpfn = src / PM_ENTRY_BYTES;
+ start_vaddr = svpfn << PAGE_SHIFT;
+ end_vaddr = TASK_SIZE_OF(task);
+
+ /* watch out for wraparound */
+ if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+ start_vaddr = end_vaddr;
+
+ /*
+ * The odds are that this will stop walking way
+ * before end_vaddr, because the length of the
+ * user buffer is tracked in "pm", and the walk
+ * will stop when we hit the end of the buffer.
+ */
+ ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk);
+ if (ret == PM_END_OF_BUFFER)
+ ret = 0;
+ /* don't need mmap_sem for these, but this looks cleaner */
+ *ppos += (char *)pm.out - buf;
+ if (!ret)
+ ret = (char *)pm.out - buf;
out_pages:
for (; pagecount; pagecount--) {
@@ -738,22 +739,11 @@ const struct file_operations proc_pagemap_operations = {
#ifdef CONFIG_NUMA
extern int show_numa_map(struct seq_file *m, void *v);
-static int show_numa_map_checked(struct seq_file *m, void *v)
-{
- struct proc_maps_private *priv = m->private;
- struct task_struct *task = priv->task;
-
- if (maps_protect && !ptrace_may_attach(task))
- return -EACCES;
-
- return show_numa_map(m, v);
-}
-
static const struct seq_operations proc_pid_numa_maps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
- .show = show_numa_map_checked
+ .show = show_numa_map,
};
static int numa_maps_open(struct inode *inode, struct file *file)
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 4b4f9cc2f18..219bd79ea89 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -110,11 +110,6 @@ int task_statm(struct mm_struct *mm, int *shared, int *text,
static int show_map(struct seq_file *m, void *_vml)
{
struct vm_list_struct *vml = _vml;
- struct proc_maps_private *priv = m->private;
- struct task_struct *task = priv->task;
-
- if (maps_protect && !ptrace_may_attach(task))
- return -EACCES;
return nommu_vma_show(m, vml->vma);
}
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9ac0f5e064e..841368b87a2 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -165,14 +165,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer,
return acc;
}
-static int open_vmcore(struct inode *inode, struct file *filp)
-{
- return 0;
-}
-
const struct file_operations proc_vmcore_operations = {
.read = read_vmcore,
- .open = open_vmcore,
};
static struct vmcore* __init get_new_element(void)
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index b31ab78052b..2aad1044b84 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -553,7 +553,7 @@ static void qnx4_destroy_inode(struct inode *inode)
kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo;
diff --git a/fs/quota.c b/fs/quota.c
index db1cc9f3c7a..7f4386ebc23 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -186,7 +186,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
void sync_dquots(struct super_block *sb, int type)
{
- int cnt, dirty;
+ int cnt;
if (sb) {
if (sb->s_qcop->quota_sync)
@@ -198,11 +198,17 @@ void sync_dquots(struct super_block *sb, int type)
restart:
list_for_each_entry(sb, &super_blocks, s_list) {
/* This test just improves performance so it needn't be reliable... */
- for (cnt = 0, dirty = 0; cnt < MAXQUOTAS; cnt++)
- if ((type == cnt || type == -1) && sb_has_quota_enabled(sb, cnt)
- && info_any_dirty(&sb_dqopt(sb)->info[cnt]))
- dirty = 1;
- if (!dirty)
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (type != -1 && type != cnt)
+ continue;
+ if (!sb_has_quota_enabled(sb, cnt))
+ continue;
+ if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
+ list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
+ continue;
+ break;
+ }
+ if (cnt == MAXQUOTAS)
continue;
sb->s_count++;
spin_unlock(&sb_lock);
diff --git a/fs/quota_v1.c b/fs/quota_v1.c
index a6cf9269105..5ae15b13eeb 100644
--- a/fs/quota_v1.c
+++ b/fs/quota_v1.c
@@ -1,6 +1,7 @@
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/quota.h>
+#include <linux/quotaops.h>
#include <linux/dqblk_v1.h>
#include <linux/quotaio_v1.h>
#include <linux/kernel.h>
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index 234ada90363..b53827dc02d 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/quotaops.h>
#include <asm/byteorder.h>
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 9590b902430..78f613cb9c7 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = {
.mmap = generic_file_mmap,
.fsync = simple_sync_file,
.splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 0989bc2c2f6..5145cb9125a 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = {
.aio_write = generic_file_aio_write,
.fsync = simple_sync_file,
.splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
};
@@ -57,7 +58,7 @@ const struct inode_operations ramfs_file_inode_operations = {
* size 0 on the assumption that it's going to be used for an mmap of shared
* memory
*/
-static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
+int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
{
struct pagevec lru_pvec;
unsigned long npages, xpages, loop, limit;
diff --git a/fs/read_write.c b/fs/read_write.c
index f0d1240a5c6..9ba495d5a29 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = {
EXPORT_SYMBOL(generic_ro_fops);
-loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
+loff_t
+generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
{
loff_t retval;
struct inode *inode = file->f_mapping->host;
- mutex_lock(&inode->i_mutex);
switch (origin) {
case SEEK_END:
offset += inode->i_size;
@@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
}
retval = -EINVAL;
if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
+ /* Special lock needed here? */
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
}
retval = offset;
}
- mutex_unlock(&inode->i_mutex);
return retval;
}
+EXPORT_SYMBOL(generic_file_llseek_unlocked);
-EXPORT_SYMBOL(generic_file_llseek);
-
-loff_t remote_llseek(struct file *file, loff_t offset, int origin)
+loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
{
- loff_t retval;
-
- lock_kernel();
- switch (origin) {
- case SEEK_END:
- offset += i_size_read(file->f_path.dentry->d_inode);
- break;
- case SEEK_CUR:
- offset += file->f_pos;
- }
- retval = -EINVAL;
- if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
- retval = offset;
- }
- unlock_kernel();
- return retval;
+ loff_t n;
+ mutex_lock(&file->f_dentry->d_inode->i_mutex);
+ n = generic_file_llseek_unlocked(file, offset, origin);
+ mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+ return n;
}
-EXPORT_SYMBOL(remote_llseek);
+EXPORT_SYMBOL(generic_file_llseek);
loff_t no_llseek(struct file *file, loff_t offset, int origin)
{
diff --git a/fs/readdir.c b/fs/readdir.c
index 4e026e5407f..93a7559bbfd 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -80,8 +80,10 @@ static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset
if (buf->result)
return -EINVAL;
d_ino = ino;
- if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+ if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+ buf->result = -EOVERFLOW;
return -EOVERFLOW;
+ }
buf->result++;
dirent = buf->dirent;
if (!access_ok(VERIFY_WRITE, dirent,
@@ -155,8 +157,10 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset,
if (reclen > buf->count)
return -EINVAL;
d_ino = ino;
- if (sizeof(d_ino) < sizeof(ino) && d_ino != ino)
+ if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+ buf->error = -EOVERFLOW;
return -EOVERFLOW;
+ }
dirent = buf->previous;
if (dirent) {
if (__put_user(offset, &dirent->d_off))
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 57917932212..5699171212a 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -45,6 +45,8 @@ void reiserfs_delete_inode(struct inode *inode)
goto out;
reiserfs_update_inode_transaction(inode);
+ reiserfs_discard_prealloc(&th, inode);
+
err = reiserfs_delete_object(&th, inode);
/* Do quota update inside a transaction for journaled quotas. We must do that
@@ -2433,7 +2435,7 @@ static int reiserfs_write_full_page(struct page *page,
if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
lock_buffer(bh);
} else {
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
redirty_page_for_writepage(wbc, page);
continue;
}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index e396b2fa474..c21df71943a 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -34,15 +34,10 @@
** from within kupdate, it will ignore the immediate flag
*/
-#include <asm/uaccess.h>
-#include <asm/system.h>
-
#include <linux/time.h>
#include <linux/semaphore.h>
-
#include <linux/vmalloc.h>
#include <linux/reiserfs_fs.h>
-
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
@@ -54,6 +49,9 @@
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/uaccess.h>
+
+#include <asm/system.h>
/* gets a struct reiserfs_journal_list * from a list head */
#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
@@ -558,13 +556,13 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
static inline void lock_journal(struct super_block *p_s_sb)
{
PROC_INFO_INC(p_s_sb, journal.lock_journal);
- down(&SB_JOURNAL(p_s_sb)->j_lock);
+ mutex_lock(&SB_JOURNAL(p_s_sb)->j_mutex);
}
/* unlock the current transaction */
static inline void unlock_journal(struct super_block *p_s_sb)
{
- up(&SB_JOURNAL(p_s_sb)->j_lock);
+ mutex_unlock(&SB_JOURNAL(p_s_sb)->j_mutex);
}
static inline void get_journal_list(struct reiserfs_journal_list *jl)
@@ -629,7 +627,7 @@ static int journal_list_still_alive(struct super_block *s,
static void release_buffer_page(struct buffer_head *bh)
{
struct page *page = bh->b_page;
- if (!page->mapping && !TestSetPageLocked(page)) {
+ if (!page->mapping && trylock_page(page)) {
page_cache_get(page);
put_bh(bh);
if (!page->mapping)
@@ -857,7 +855,7 @@ static int write_ordered_buffers(spinlock_t * lock,
jh = JH_ENTRY(list->next);
bh = jh->bh;
get_bh(bh);
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
if (!buffer_dirty(bh)) {
list_move(&jh->list, &tmp);
goto loop_next;
@@ -1045,9 +1043,9 @@ static int flush_commit_list(struct super_block *s,
}
/* make sure nobody is trying to flush this one at the same time */
- down(&jl->j_commit_lock);
+ mutex_lock(&jl->j_commit_mutex);
if (!journal_list_still_alive(s, trans_id)) {
- up(&jl->j_commit_lock);
+ mutex_unlock(&jl->j_commit_mutex);
goto put_jl;
}
BUG_ON(jl->j_trans_id == 0);
@@ -1057,7 +1055,7 @@ static int flush_commit_list(struct super_block *s,
if (flushall) {
atomic_set(&(jl->j_older_commits_done), 1);
}
- up(&jl->j_commit_lock);
+ mutex_unlock(&jl->j_commit_mutex);
goto put_jl;
}
@@ -1181,7 +1179,7 @@ static int flush_commit_list(struct super_block *s,
if (flushall) {
atomic_set(&(jl->j_older_commits_done), 1);
}
- up(&jl->j_commit_lock);
+ mutex_unlock(&jl->j_commit_mutex);
put_jl:
put_journal_list(s, jl);
@@ -1411,8 +1409,8 @@ static int flush_journal_list(struct super_block *s,
/* if flushall == 0, the lock is already held */
if (flushall) {
- down(&journal->j_flush_sem);
- } else if (!down_trylock(&journal->j_flush_sem)) {
+ mutex_lock(&journal->j_flush_mutex);
+ } else if (mutex_trylock(&journal->j_flush_mutex)) {
BUG();
}
@@ -1642,7 +1640,7 @@ static int flush_journal_list(struct super_block *s,
jl->j_state = 0;
put_journal_list(s, jl);
if (flushall)
- up(&journal->j_flush_sem);
+ mutex_unlock(&journal->j_flush_mutex);
put_fs_excl();
return err;
}
@@ -1772,12 +1770,12 @@ static int kupdate_transactions(struct super_block *s,
struct reiserfs_journal *journal = SB_JOURNAL(s);
chunk.nr = 0;
- down(&journal->j_flush_sem);
+ mutex_lock(&journal->j_flush_mutex);
if (!journal_list_still_alive(s, orig_trans_id)) {
goto done;
}
- /* we've got j_flush_sem held, nobody is going to delete any
+ /* we've got j_flush_mutex held, nobody is going to delete any
* of these lists out from underneath us
*/
while ((num_trans && transactions_flushed < num_trans) ||
@@ -1812,7 +1810,7 @@ static int kupdate_transactions(struct super_block *s,
}
done:
- up(&journal->j_flush_sem);
+ mutex_unlock(&journal->j_flush_mutex);
return ret;
}
@@ -2556,7 +2554,7 @@ static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
INIT_LIST_HEAD(&jl->j_working_list);
INIT_LIST_HEAD(&jl->j_tail_bh_list);
INIT_LIST_HEAD(&jl->j_bh_list);
- sema_init(&jl->j_commit_lock, 1);
+ mutex_init(&jl->j_commit_mutex);
SB_JOURNAL(s)->j_num_lists++;
get_journal_list(jl);
return jl;
@@ -2837,8 +2835,8 @@ int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
journal->j_last = NULL;
journal->j_first = NULL;
init_waitqueue_head(&(journal->j_join_wait));
- sema_init(&journal->j_lock, 1);
- sema_init(&journal->j_flush_sem, 1);
+ mutex_init(&journal->j_mutex);
+ mutex_init(&journal->j_flush_mutex);
journal->j_trans_id = 10;
journal->j_mount_id = 10;
@@ -3873,7 +3871,7 @@ int reiserfs_prepare_for_journal(struct super_block *p_s_sb,
{
PROC_INFO_INC(p_s_sb, journal.prepare);
- if (test_set_buffer_locked(bh)) {
+ if (!trylock_buffer(bh)) {
if (!wait)
return 0;
lock_buffer(bh);
@@ -4030,7 +4028,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
* the new transaction is fully setup, and we've already flushed the
* ordered bh list
*/
- down(&jl->j_commit_lock);
+ mutex_lock(&jl->j_commit_mutex);
/* save the transaction id in case we need to commit it later */
commit_trans_id = jl->j_trans_id;
@@ -4196,7 +4194,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
lock_kernel();
}
BUG_ON(!list_empty(&jl->j_tail_bh_list));
- up(&jl->j_commit_lock);
+ mutex_unlock(&jl->j_commit_mutex);
/* honor the flush wishes from the caller, simple commits can
** be done outside the journal lock, they are done below
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index ed424d708e6..d318c7e663f 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -22,11 +22,11 @@
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
+#include <linux/quotaops.h>
#include <linux/vfs.h>
#include <linux/mnt_namespace.h>
#include <linux/mount.h>
#include <linux/namei.h>
-#include <linux/quotaops.h>
struct file_system_type reiserfs_fs_type;
@@ -182,7 +182,7 @@ static int finish_unfinished(struct super_block *s)
int ret = reiserfs_quota_on_mount(s, i);
if (ret < 0)
reiserfs_warning(s,
- "reiserfs: cannot turn on journalled quota: error %d",
+ "reiserfs: cannot turn on journaled quota: error %d",
ret);
}
}
@@ -520,7 +520,7 @@ static void reiserfs_destroy_inode(struct inode *inode)
kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
}
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
{
struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
@@ -876,7 +876,9 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
mount options were selected. */
unsigned long *blocks, /* strtol-ed from NNN of resize=NNN */
char **jdev_name,
- unsigned int *commit_max_age)
+ unsigned int *commit_max_age,
+ char **qf_names,
+ unsigned int *qfmt)
{
int c;
char *arg = NULL;
@@ -992,9 +994,11 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
if (c == 'u' || c == 'g') {
int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
- if (sb_any_quota_enabled(s)) {
+ if ((sb_any_quota_enabled(s) ||
+ sb_any_quota_suspended(s)) &&
+ (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
reiserfs_warning(s,
- "reiserfs_parse_options: cannot change journalled quota options when quota turned on.");
+ "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
return 0;
}
if (*arg) { /* Some filename specified? */
@@ -1011,46 +1015,54 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
"reiserfs_parse_options: quotafile must be on filesystem root.");
return 0;
}
- REISERFS_SB(s)->s_qf_names[qtype] =
+ qf_names[qtype] =
kmalloc(strlen(arg) + 1, GFP_KERNEL);
- if (!REISERFS_SB(s)->s_qf_names[qtype]) {
+ if (!qf_names[qtype]) {
reiserfs_warning(s,
"reiserfs_parse_options: not enough memory for storing quotafile name.");
return 0;
}
- strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg);
+ strcpy(qf_names[qtype], arg);
*mount_options |= 1 << REISERFS_QUOTA;
} else {
- kfree(REISERFS_SB(s)->s_qf_names[qtype]);
- REISERFS_SB(s)->s_qf_names[qtype] = NULL;
+ if (qf_names[qtype] !=
+ REISERFS_SB(s)->s_qf_names[qtype])
+ kfree(qf_names[qtype]);
+ qf_names[qtype] = NULL;
}
}
if (c == 'f') {
if (!strcmp(arg, "vfsold"))
- REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_OLD;
+ *qfmt = QFMT_VFS_OLD;
else if (!strcmp(arg, "vfsv0"))
- REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_V0;
+ *qfmt = QFMT_VFS_V0;
else {
reiserfs_warning(s,
"reiserfs_parse_options: unknown quota format specified.");
return 0;
}
+ if ((sb_any_quota_enabled(s) ||
+ sb_any_quota_suspended(s)) &&
+ *qfmt != REISERFS_SB(s)->s_jquota_fmt) {
+ reiserfs_warning(s,
+ "reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
+ return 0;
+ }
}
#else
if (c == 'u' || c == 'g' || c == 'f') {
reiserfs_warning(s,
- "reiserfs_parse_options: journalled quota options not supported.");
+ "reiserfs_parse_options: journaled quota options not supported.");
return 0;
}
#endif
}
#ifdef CONFIG_QUOTA
- if (!REISERFS_SB(s)->s_jquota_fmt
- && (REISERFS_SB(s)->s_qf_names[USRQUOTA]
- || REISERFS_SB(s)->s_qf_names[GRPQUOTA])) {
+ if (!REISERFS_SB(s)->s_jquota_fmt && !*qfmt
+ && (qf_names[USRQUOTA] || qf_names[GRPQUOTA])) {
reiserfs_warning(s,
- "reiserfs_parse_options: journalled quota format not specified.");
+ "reiserfs_parse_options: journaled quota format not specified.");
return 0;
}
/* This checking is not precise wrt the quota type but for our purposes it is sufficient */
@@ -1130,6 +1142,21 @@ static void handle_attrs(struct super_block *s)
}
}
+#ifdef CONFIG_QUOTA
+static void handle_quota_files(struct super_block *s, char **qf_names,
+ unsigned int *qfmt)
+{
+ int i;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
+ kfree(REISERFS_SB(s)->s_qf_names[i]);
+ REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
+ }
+ REISERFS_SB(s)->s_jquota_fmt = *qfmt;
+}
+#endif
+
static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
{
struct reiserfs_super_block *rs;
@@ -1141,23 +1168,30 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
struct reiserfs_journal *journal = SB_JOURNAL(s);
char *new_opts = kstrdup(arg, GFP_KERNEL);
int err;
+ char *qf_names[MAXQUOTAS];
+ unsigned int qfmt = 0;
#ifdef CONFIG_QUOTA
int i;
+
+ memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
#endif
rs = SB_DISK_SUPER_BLOCK(s);
if (!reiserfs_parse_options
- (s, arg, &mount_options, &blocks, NULL, &commit_max_age)) {
+ (s, arg, &mount_options, &blocks, NULL, &commit_max_age,
+ qf_names, &qfmt)) {
#ifdef CONFIG_QUOTA
- for (i = 0; i < MAXQUOTAS; i++) {
- kfree(REISERFS_SB(s)->s_qf_names[i]);
- REISERFS_SB(s)->s_qf_names[i] = NULL;
- }
+ for (i = 0; i < MAXQUOTAS; i++)
+ if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
+ kfree(qf_names[i]);
#endif
err = -EINVAL;
goto out_err;
}
+#ifdef CONFIG_QUOTA
+ handle_quota_files(s, qf_names, &qfmt);
+#endif
handle_attrs(s);
@@ -1570,6 +1604,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
char *jdev_name;
struct reiserfs_sb_info *sbi;
int errval = -EINVAL;
+ char *qf_names[MAXQUOTAS] = {};
+ unsigned int qfmt = 0;
save_mount_options(s, data);
@@ -1597,9 +1633,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
jdev_name = NULL;
if (reiserfs_parse_options
(s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
- &commit_max_age) == 0) {
+ &commit_max_age, qf_names, &qfmt) == 0) {
goto error;
}
+#ifdef CONFIG_QUOTA
+ handle_quota_files(s, qf_names, &qfmt);
+#endif
if (blocks) {
SWARN(silent, s, "jmacd-7: reiserfs_fill_super: resize option "
@@ -1819,7 +1858,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
return (0);
- error:
+error:
if (jinit_done) { /* kill the commit thread, free journal ram */
journal_release_error(NULL, s);
}
@@ -1830,10 +1869,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
#ifdef CONFIG_QUOTA
{
int j;
- for (j = 0; j < MAXQUOTAS; j++) {
- kfree(sbi->s_qf_names[j]);
- sbi->s_qf_names[j] = NULL;
- }
+ for (j = 0; j < MAXQUOTAS; j++)
+ kfree(qf_names[j]);
}
#endif
kfree(sbi);
@@ -1980,7 +2017,7 @@ static int reiserfs_release_dquot(struct dquot *dquot)
static int reiserfs_mark_dquot_dirty(struct dquot *dquot)
{
- /* Are we journalling quotas? */
+ /* Are we journaling quotas? */
if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
dquot_mark_dquot_dirty(dquot);
@@ -2026,6 +2063,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
int err;
struct nameidata nd;
struct inode *inode;
+ struct reiserfs_transaction_handle th;
if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
return -EINVAL;
@@ -2037,8 +2075,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
return err;
/* Quotafile not on the same filesystem? */
if (nd.path.mnt->mnt_sb != sb) {
- path_put(&nd.path);
- return -EXDEV;
+ err = -EXDEV;
+ goto out;
}
inode = nd.path.dentry->d_inode;
/* We must not pack tails for quota files on reiserfs for quota IO to work */
@@ -2048,24 +2086,37 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
reiserfs_warning(sb,
"reiserfs: Unpacking tail of quota file failed"
" (%d). Cannot turn on quotas.", err);
- path_put(&nd.path);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
mark_inode_dirty(inode);
}
- /* Not journalling quota? No more tests needed... */
- if (!REISERFS_SB(sb)->s_qf_names[USRQUOTA] &&
- !REISERFS_SB(sb)->s_qf_names[GRPQUOTA]) {
- path_put(&nd.path);
- return vfs_quota_on(sb, type, format_id, path, 0);
- }
- /* Quotafile not of fs root? */
- if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
- reiserfs_warning(sb,
+ /* Journaling quota? */
+ if (REISERFS_SB(sb)->s_qf_names[type]) {
+ /* Quotafile not of fs root? */
+ if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
+ reiserfs_warning(sb,
"reiserfs: Quota file not on filesystem root. "
"Journalled quota will not work.");
+ }
+
+ /*
+ * When we journal data on quota file, we have to flush journal to see
+ * all updates to the file when we bypass pagecache...
+ */
+ if (reiserfs_file_data_log(inode)) {
+ /* Just start temporary transaction and finish it */
+ err = journal_begin(&th, sb, 1);
+ if (err)
+ goto out;
+ err = journal_end_sync(&th, sb, 1);
+ if (err)
+ goto out;
+ }
+ err = vfs_quota_on_path(sb, type, format_id, &nd.path);
+out:
path_put(&nd.path);
- return vfs_quota_on(sb, type, format_id, path, 0);
+ return err;
}
/* Read data from quotafile - avoid pagecache and such because we cannot afford
@@ -2165,8 +2216,10 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
blk++;
}
out:
- if (len == towrite)
+ if (len == towrite) {
+ mutex_unlock(&inode->i_mutex);
return err;
+ }
if (inode->i_size < off + len - towrite)
i_size_write(inode, off + len - towrite);
inode->i_version++;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index d7c4935c103..bb3cb5b7cdb 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -1250,7 +1250,7 @@ static int reiserfs_check_acl(struct inode *inode, int mask)
return error;
}
-int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+int reiserfs_permission(struct inode *inode, int mask)
{
/*
* We don't do permission checks on the internal objects.
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 5e90a95ad60..056008db137 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -6,8 +6,6 @@
#include <linux/reiserfs_xattr.h>
#include <asm/uaccess.h>
-#define XATTR_SECURITY_PREFIX "security."
-
static int
security_get(struct inode *inode, const char *name, void *buffer, size_t size)
{
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 024a938ca60..60abe2bb1f9 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -7,8 +7,6 @@
#include <linux/reiserfs_xattr.h>
#include <asm/uaccess.h>
-#define XATTR_TRUSTED_PREFIX "trusted."
-
static int
trusted_get(struct inode *inode, const char *name, void *buffer, size_t size)
{
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 073f39364b1..1384efcb938 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -10,8 +10,6 @@
# include <linux/reiserfs_acl.h>
#endif
-#define XATTR_USER_PREFIX "user."
-
static int
user_get(struct inode *inode, const char *name, void *buffer, size_t size)
{
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 3f13d491c7c..60d2f822e87 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -418,7 +418,8 @@ static int
romfs_readpage(struct file *file, struct page * page)
{
struct inode *inode = page->mapping->host;
- loff_t offset, avail, readlen;
+ loff_t offset, size;
+ unsigned long filled;
void *buf;
int result = -EIO;
@@ -430,21 +431,29 @@ romfs_readpage(struct file *file, struct page * page)
/* 32 bit warning -- but not for us :) */
offset = page_offset(page);
- if (offset < i_size_read(inode)) {
- avail = inode->i_size-offset;
- readlen = min_t(unsigned long, avail, PAGE_SIZE);
- if (romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen) == readlen) {
- if (readlen < PAGE_SIZE) {
- memset(buf + readlen,0,PAGE_SIZE-readlen);
- }
- SetPageUptodate(page);
- result = 0;
+ size = i_size_read(inode);
+ filled = 0;
+ result = 0;
+ if (offset < size) {
+ unsigned long readlen;
+
+ size -= offset;
+ readlen = size > PAGE_SIZE ? PAGE_SIZE : size;
+
+ filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen);
+
+ if (filled != readlen) {
+ SetPageError(page);
+ filled = 0;
+ result = -EIO;
}
}
- if (result) {
- memset(buf, 0, PAGE_SIZE);
- SetPageError(page);
- }
+
+ if (filled < PAGE_SIZE)
+ memset(buf + filled, 0, PAGE_SIZE-filled);
+
+ if (!result)
+ SetPageUptodate(page);
flush_dcache_page(page);
unlock_page(page);
@@ -577,7 +586,7 @@ static void romfs_destroy_inode(struct inode *inode)
kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct romfs_inode_info *ei = foo;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 3f54dbd6c49..bd20f7f5a93 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -108,9 +108,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
goto Done;
}
/* we need at least one record in buffer */
+ pos = m->index;
+ p = m->op->start(m, &pos);
while (1) {
- pos = m->index;
- p = m->op->start(m, &pos);
err = PTR_ERR(p);
if (!p || IS_ERR(p))
break;
@@ -119,6 +119,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
break;
if (unlikely(err))
m->count = 0;
+ if (unlikely(!m->count)) {
+ p = m->op->next(m, p, &pos);
+ m->index = pos;
+ continue;
+ }
if (m->count < m->size)
goto Fill;
m->op->stop(m, p);
@@ -128,6 +133,8 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
goto Enomem;
m->count = 0;
m->version = 0;
+ pos = m->index;
+ p = m->op->start(m, &pos);
}
m->op->stop(m, p);
m->count = 0;
@@ -443,6 +450,20 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
return -1;
}
+int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
+{
+ size_t len = bitmap_scnprintf_len(nr_bits);
+
+ if (m->count + len < m->size) {
+ bitmap_scnprintf(m->buf + m->count, m->size - m->count,
+ bits, nr_bits);
+ m->count += len;
+ return 0;
+ }
+ m->count = m->size;
+ return -1;
+}
+
static void *single_start(struct seq_file *p, loff_t *pos)
{
return NULL + (*pos == 0);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 619725644c7..9c39bc7f843 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -205,11 +205,19 @@ static const struct file_operations signalfd_fops = {
.read = signalfd_read,
};
-asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask)
+asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask,
+ size_t sizemask, int flags)
{
sigset_t sigmask;
struct signalfd_ctx *ctx;
+ /* Check the SFD_* constants for consistency. */
+ BUILD_BUG_ON(SFD_CLOEXEC != O_CLOEXEC);
+ BUILD_BUG_ON(SFD_NONBLOCK != O_NONBLOCK);
+
+ if (flags & ~(SFD_CLOEXEC | SFD_NONBLOCK))
+ return -EINVAL;
+
if (sizemask != sizeof(sigset_t) ||
copy_from_user(&sigmask, user_mask, sizeof(sigmask)))
return -EINVAL;
@@ -227,7 +235,8 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas
* When we call this, the initialization must be complete, since
* anon_inode_getfd() will install the fd.
*/
- ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx);
+ ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx,
+ flags & (O_CLOEXEC | O_NONBLOCK));
if (ufd < 0)
kfree(ctx);
} else {
@@ -249,3 +258,9 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas
return ufd;
}
+
+asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask,
+ size_t sizemask)
+{
+ return sys_signalfd4(ufd, user_mask, sizemask, 0);
+}
diff --git a/fs/smbfs/cache.c b/fs/smbfs/cache.c
index 8182f0542a2..8c177eb7e34 100644
--- a/fs/smbfs/cache.c
+++ b/fs/smbfs/cache.c
@@ -13,7 +13,6 @@
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/dirent.h>
#include <linux/smb_fs.h>
#include <linux/pagemap.h>
#include <linux/net.h>
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index efbe29af3d7..e4f8d51a555 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -408,7 +408,7 @@ smb_file_release(struct inode *inode, struct file * file)
* privileges, so we need our own check for this.
*/
static int
-smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
+smb_file_permission(struct inode *inode, int mask)
{
int mode = inode->i_mode;
int error = 0;
@@ -417,14 +417,23 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
/* Look at user permissions */
mode >>= 6;
- if ((mode & 7 & mask) != mask)
+ if (mask & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC))
error = -EACCES;
return error;
}
+static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t ret;
+ lock_kernel();
+ ret = generic_file_llseek_unlocked(file, offset, origin);
+ unlock_kernel();
+ return ret;
+}
+
const struct file_operations smb_file_operations =
{
- .llseek = remote_llseek,
+ .llseek = smb_remote_llseek,
.read = do_sync_read,
.aio_read = smb_file_aio_read,
.write = do_sync_write,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 376ef3ee6ed..3528f40ffb0 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -67,7 +67,7 @@ static void smb_destroy_inode(struct inode *inode)
kmem_cache_free(smb_inode_cachep, SMB_I(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct smb_inode_info *ei = (struct smb_inode_info *) foo;
diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c
index d517a27b7f4..ee536e8a649 100644
--- a/fs/smbfs/proc.c
+++ b/fs/smbfs/proc.c
@@ -16,7 +16,6 @@
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/dcache.h>
-#include <linux/dirent.h>
#include <linux/nls.h>
#include <linux/smp_lock.h>
#include <linux/net.h>
diff --git a/fs/splice.c b/fs/splice.c
index aa5f6f60b30..a1e701c2715 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -371,7 +371,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
* for an in-flight io page
*/
if (flags & SPLICE_F_NONBLOCK) {
- if (TestSetPageLocked(page)) {
+ if (!trylock_page(page)) {
error = -EAGAIN;
break;
}
@@ -379,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
lock_page(page);
/*
- * page was truncated, stop here. if this isn't the
- * first page, we'll just complete what we already
- * added
+ * Page was truncated, or invalidated by the
+ * filesystem. Redo the find/create, but this time the
+ * page is kept locked, so there's no chance of another
+ * race with truncate/invalidate.
*/
if (!page->mapping) {
unlock_page(page);
- break;
+ page = find_or_create_page(mapping, index,
+ mapping_gfp_mask(mapping));
+
+ if (!page) {
+ error = -ENOMEM;
+ break;
+ }
+ page_cache_release(pages[page_nr]);
+ pages[page_nr] = page;
}
/*
* page was already under io and is now done, great
@@ -763,7 +772,7 @@ generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
ssize_t ret;
int err;
- err = remove_suid(out->f_path.dentry);
+ err = file_remove_suid(out);
if (unlikely(err))
return err;
@@ -821,7 +830,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
ssize_t ret;
inode_double_lock(inode, pipe->inode);
- ret = remove_suid(out->f_path.dentry);
+ ret = file_remove_suid(out);
if (likely(!ret))
ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
inode_double_unlock(inode, pipe->inode);
@@ -889,6 +898,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
if (unlikely(!(out->f_mode & FMODE_WRITE)))
return -EBADF;
+ if (unlikely(out->f_flags & O_APPEND))
+ return -EINVAL;
+
ret = rw_verify_area(WRITE, out, ppos, len);
if (unlikely(ret < 0))
return ret;
@@ -1152,36 +1164,6 @@ static long do_splice(struct file *in, loff_t __user *off_in,
}
/*
- * Do a copy-from-user while holding the mmap_semaphore for reading, in a
- * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem
- * for writing) and page faulting on the user memory pointed to by src.
- * This assumes that we will very rarely hit the partial != 0 path, or this
- * will not be a win.
- */
-static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n)
-{
- int partial;
-
- if (!access_ok(VERIFY_READ, src, n))
- return -EFAULT;
-
- pagefault_disable();
- partial = __copy_from_user_inatomic(dst, src, n);
- pagefault_enable();
-
- /*
- * Didn't copy everything, drop the mmap_sem and do a faulting copy
- */
- if (unlikely(partial)) {
- up_read(&current->mm->mmap_sem);
- partial = copy_from_user(dst, src, n);
- down_read(&current->mm->mmap_sem);
- }
-
- return partial;
-}
-
-/*
* Map an iov into an array of pages and offset/length tupples. With the
* partial_page structure, we can map several non-contiguous ranges into
* our ones pages[] map instead of splitting that operation into pieces.
@@ -1194,8 +1176,6 @@ static int get_iovec_page_array(const struct iovec __user *iov,
{
int buffers = 0, error = 0;
- down_read(&current->mm->mmap_sem);
-
while (nr_vecs) {
unsigned long off, npages;
struct iovec entry;
@@ -1204,7 +1184,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
int i;
error = -EFAULT;
- if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry)))
+ if (copy_from_user(&entry, iov, sizeof(entry)))
break;
base = entry.iov_base;
@@ -1238,9 +1218,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
if (npages > PIPE_BUFFERS - buffers)
npages = PIPE_BUFFERS - buffers;
- error = get_user_pages(current, current->mm,
- (unsigned long) base, npages, 0, 0,
- &pages[buffers], NULL);
+ error = get_user_pages_fast((unsigned long)base, npages,
+ 0, &pages[buffers]);
if (unlikely(error <= 0))
break;
@@ -1279,8 +1258,6 @@ static int get_iovec_page_array(const struct iovec __user *iov,
iov++;
}
- up_read(&current->mm->mmap_sem);
-
if (buffers)
return buffers;
diff --git a/fs/stat.c b/fs/stat.c
index 9cf41f719d5..7c46fbeb8b7 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -57,13 +57,13 @@ EXPORT_SYMBOL(vfs_getattr);
int vfs_stat_fd(int dfd, char __user *name, struct kstat *stat)
{
- struct nameidata nd;
+ struct path path;
int error;
- error = __user_walk_fd(dfd, name, LOOKUP_FOLLOW, &nd);
+ error = user_path_at(dfd, name, LOOKUP_FOLLOW, &path);
if (!error) {
- error = vfs_getattr(nd.path.mnt, nd.path.dentry, stat);
- path_put(&nd.path);
+ error = vfs_getattr(path.mnt, path.dentry, stat);
+ path_put(&path);
}
return error;
}
@@ -77,13 +77,13 @@ EXPORT_SYMBOL(vfs_stat);
int vfs_lstat_fd(int dfd, char __user *name, struct kstat *stat)
{
- struct nameidata nd;
+ struct path path;
int error;
- error = __user_walk_fd(dfd, name, 0, &nd);
+ error = user_path_at(dfd, name, 0, &path);
if (!error) {
- error = vfs_getattr(nd.path.mnt, nd.path.dentry, stat);
- path_put(&nd.path);
+ error = vfs_getattr(path.mnt, path.dentry, stat);
+ path_put(&path);
}
return error;
}
@@ -291,29 +291,29 @@ asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf)
return error;
}
-asmlinkage long sys_readlinkat(int dfd, const char __user *path,
+asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
char __user *buf, int bufsiz)
{
- struct nameidata nd;
+ struct path path;
int error;
if (bufsiz <= 0)
return -EINVAL;
- error = __user_walk_fd(dfd, path, 0, &nd);
+ error = user_path_at(dfd, pathname, 0, &path);
if (!error) {
- struct inode *inode = nd.path.dentry->d_inode;
+ struct inode *inode = path.dentry->d_inode;
error = -EINVAL;
if (inode->i_op && inode->i_op->readlink) {
- error = security_inode_readlink(nd.path.dentry);
+ error = security_inode_readlink(path.dentry);
if (!error) {
- touch_atime(nd.path.mnt, nd.path.dentry);
- error = inode->i_op->readlink(nd.path.dentry,
+ touch_atime(path.mnt, path.dentry);
+ error = inode->i_op->readlink(path.dentry,
buf, bufsiz);
}
}
- path_put(&nd.path);
+ path_put(&path);
}
return error;
}
diff --git a/fs/super.c b/fs/super.c
index 453877c5697..e931ae9511f 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -70,6 +70,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
INIT_LIST_HEAD(&s->s_instances);
INIT_HLIST_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
+ INIT_LIST_HEAD(&s->s_dentry_lru);
init_rwsem(&s->s_umount);
mutex_init(&s->s_lock);
lockdep_set_class(&s->s_umount, &type->s_umount_key);
diff --git a/fs/sync.c b/fs/sync.c
index 228e17b5e9e..2967562d416 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -139,7 +139,8 @@ asmlinkage long sys_fdatasync(unsigned int fd)
* before performing the write.
*
* SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the
- * range which are not presently under writeback.
+ * range which are not presently under writeback. Note that this may block for
+ * significant periods due to exhaustion of disk request structures.
*
* SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range
* after performing the write.
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 8c0e4b92574..aedaeba82ae 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -398,7 +398,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
}
/**
- * sysfs_add_one - add sysfs_dirent to parent
+ * __sysfs_add_one - add sysfs_dirent to parent without warning
* @acxt: addrm context to use
* @sd: sysfs_dirent to be added
*
@@ -417,7 +417,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
* 0 on success, -EEXIST if entry with the given name already
* exists.
*/
-int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
+int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
{
if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
return -EEXIST;
@@ -435,6 +435,36 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
}
/**
+ * sysfs_add_one - add sysfs_dirent to parent
+ * @acxt: addrm context to use
+ * @sd: sysfs_dirent to be added
+ *
+ * Get @acxt->parent_sd and set sd->s_parent to it and increment
+ * nlink of parent inode if @sd is a directory and link into the
+ * children list of the parent.
+ *
+ * This function should be called between calls to
+ * sysfs_addrm_start() and sysfs_addrm_finish() and should be
+ * passed the same @acxt as passed to sysfs_addrm_start().
+ *
+ * LOCKING:
+ * Determined by sysfs_addrm_start().
+ *
+ * RETURNS:
+ * 0 on success, -EEXIST if entry with the given name already
+ * exists.
+ */
+int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
+{
+ int ret;
+
+ ret = __sysfs_add_one(acxt, sd);
+ WARN(ret == -EEXIST, KERN_WARNING "sysfs: duplicate filename '%s' "
+ "can not be created\n", sd->s_name);
+ return ret;
+}
+
+/**
* sysfs_remove_one - remove sysfs_dirent from parent
* @acxt: addrm context to use
* @sd: sysfs_dirent to be removed
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index e7735f643cd..c9e4e5091da 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -14,6 +14,7 @@
#include <linux/kobject.h>
#include <linux/kallsyms.h>
#include <linux/slab.h>
+#include <linux/fsnotify.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/list.h>
@@ -336,9 +337,8 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
if (kobj->ktype && kobj->ktype->sysfs_ops)
ops = kobj->ktype->sysfs_ops;
else {
- printk(KERN_ERR "missing sysfs attribute operations for "
+ WARN(1, KERN_ERR "missing sysfs attribute operations for "
"kobject: %s\n", kobject_name(kobj));
- WARN_ON(1);
goto err_out;
}
@@ -585,9 +585,11 @@ int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode)
newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- rc = notify_change(victim, &newattrs);
+ newattrs.ia_ctime = current_fs_time(inode->i_sb);
+ rc = sysfs_setattr(victim, &newattrs);
if (rc == 0) {
+ fsnotify_change(victim, newattrs.ia_valid);
mutex_lock(&sysfs_mutex);
victim_sd->s_mode = newattrs.ia_mode;
mutex_unlock(&sysfs_mutex);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index eeba38417b1..fe611949a7f 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -134,9 +134,8 @@ void sysfs_remove_group(struct kobject * kobj,
if (grp->name) {
sd = sysfs_get_dirent(dir_sd, grp->name);
if (!sd) {
- printk(KERN_WARNING "sysfs group %p not found for "
+ WARN(!sd, KERN_WARNING "sysfs group %p not found for "
"kobject '%s'\n", grp, kobject_name(kobj));
- WARN_ON(!sd);
return;
}
} else
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 817f5966edc..a3ba217fbe7 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -19,13 +19,8 @@
#include "sysfs.h"
-/**
- * sysfs_create_link - create symlink between two objects.
- * @kobj: object whose directory we're creating the link in.
- * @target: object we're pointing to.
- * @name: name of the symlink.
- */
-int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char * name)
+static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
+ const char *name, int warn)
{
struct sysfs_dirent *parent_sd = NULL;
struct sysfs_dirent *target_sd = NULL;
@@ -65,7 +60,10 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
target_sd = NULL; /* reference is now owned by the symlink */
sysfs_addrm_start(&acxt, parent_sd);
- error = sysfs_add_one(&acxt, sd);
+ if (warn)
+ error = sysfs_add_one(&acxt, sd);
+ else
+ error = __sysfs_add_one(&acxt, sd);
sysfs_addrm_finish(&acxt);
if (error)
@@ -80,6 +78,33 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
}
/**
+ * sysfs_create_link - create symlink between two objects.
+ * @kobj: object whose directory we're creating the link in.
+ * @target: object we're pointing to.
+ * @name: name of the symlink.
+ */
+int sysfs_create_link(struct kobject *kobj, struct kobject *target,
+ const char *name)
+{
+ return sysfs_do_create_link(kobj, target, name, 1);
+}
+
+/**
+ * sysfs_create_link_nowarn - create symlink between two objects.
+ * @kobj: object whose directory we're creating the link in.
+ * @target: object we're pointing to.
+ * @name: name of the symlink.
+ *
+ * This function does the same as sysf_create_link(), but it
+ * doesn't warn if the link already exists.
+ */
+int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
+ const char *name)
+{
+ return sysfs_do_create_link(kobj, target, name, 0);
+}
+
+/**
* sysfs_remove_link - remove symlink in object's directory.
* @kobj: object we're acting for.
* @name: name of the symlink to remove.
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index ce4e15f8aae..a5db496f71c 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -107,6 +107,7 @@ struct sysfs_dirent *sysfs_get_active_two(struct sysfs_dirent *sd);
void sysfs_put_active_two(struct sysfs_dirent *sd);
void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
struct sysfs_dirent *parent_sd);
+int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index c5d60de0658..df0d435baa4 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -326,7 +326,7 @@ static void sysv_destroy_inode(struct inode *inode)
kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
}
-static void init_once(struct kmem_cache *cachep, void *p)
+static void init_once(void *p)
{
struct sysv_inode_info *si = (struct sysv_inode_info *)p;
diff --git a/fs/timerfd.c b/fs/timerfd.c
index d87d354ec42..c502c60e4f5 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -184,7 +184,11 @@ asmlinkage long sys_timerfd_create(int clockid, int flags)
int ufd;
struct timerfd_ctx *ctx;
- if (flags)
+ /* Check the TFD_* constants for consistency. */
+ BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
+ BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK);
+
+ if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK))
return -EINVAL;
if (clockid != CLOCK_MONOTONIC &&
clockid != CLOCK_REALTIME)
@@ -198,7 +202,8 @@ asmlinkage long sys_timerfd_create(int clockid, int flags)
ctx->clockid = clockid;
hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
- ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx);
+ ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
+ flags & (O_CLOEXEC | O_NONBLOCK));
if (ufd < 0)
kfree(ctx);
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
new file mode 100644
index 00000000000..91ceeda7e5b
--- /dev/null
+++ b/fs/ubifs/Kconfig
@@ -0,0 +1,72 @@
+config UBIFS_FS
+ tristate "UBIFS file system support"
+ select CRC16
+ select CRC32
+ select CRYPTO if UBIFS_FS_ADVANCED_COMPR
+ select CRYPTO if UBIFS_FS_LZO
+ select CRYPTO if UBIFS_FS_ZLIB
+ select CRYPTO_LZO if UBIFS_FS_LZO
+ select CRYPTO_DEFLATE if UBIFS_FS_ZLIB
+ depends on MTD_UBI
+ help
+ UBIFS is a file system for flash devices which works on top of UBI.
+
+config UBIFS_FS_XATTR
+ bool "Extended attributes support"
+ depends on UBIFS_FS
+ help
+ This option enables support of extended attributes.
+
+config UBIFS_FS_ADVANCED_COMPR
+ bool "Advanced compression options"
+ depends on UBIFS_FS
+ help
+ This option allows to explicitly choose which compressions, if any,
+ are enabled in UBIFS. Removing compressors means inbility to read
+ existing file systems.
+
+ If unsure, say 'N'.
+
+config UBIFS_FS_LZO
+ bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR
+ depends on UBIFS_FS
+ default y
+ help
+ LZO compressor is generally faster then zlib but compresses worse.
+ Say 'Y' if unsure.
+
+config UBIFS_FS_ZLIB
+ bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR
+ depends on UBIFS_FS
+ default y
+ help
+ Zlib copresses better then LZO but it is slower. Say 'Y' if unsure.
+
+# Debugging-related stuff
+config UBIFS_FS_DEBUG
+ bool "Enable debugging"
+ depends on UBIFS_FS
+ select DEBUG_FS
+ select KALLSYMS_ALL
+ help
+ This option enables UBIFS debugging.
+
+config UBIFS_FS_DEBUG_MSG_LVL
+ int "Default message level (0 = no extra messages, 3 = lots)"
+ depends on UBIFS_FS_DEBUG
+ default "0"
+ help
+ This controls the amount of debugging messages produced by UBIFS.
+ If reporting bugs, please try to have available a full dump of the
+ messages at level 1 while the misbehaviour was occurring. Level 2
+ may become necessary if level 1 messages were not enough to find the
+ bug. Generally Level 3 should be avoided.
+
+config UBIFS_FS_DEBUG_CHKS
+ bool "Enable extra checks"
+ depends on UBIFS_FS_DEBUG
+ help
+ If extra checks are enabled UBIFS will check the consistency of its
+ internal data structures during operation. However, UBIFS performance
+ is dramatically slower when this option is selected especially if the
+ file system is large.
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile
new file mode 100644
index 00000000000..80e93c35e49
--- /dev/null
+++ b/fs/ubifs/Makefile
@@ -0,0 +1,9 @@
+obj-$(CONFIG_UBIFS_FS) += ubifs.o
+
+ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o
+ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
+ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
+ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o
+
+ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o
+ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
new file mode 100644
index 00000000000..73db464cd08
--- /dev/null
+++ b/fs/ubifs/budget.c
@@ -0,0 +1,812 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the budgeting sub-system which is responsible for UBIFS
+ * space management.
+ *
+ * Factors such as compression, wasted space at the ends of LEBs, space in other
+ * journal heads, the effect of updates on the index, and so on, make it
+ * impossible to accurately predict the amount of space needed. Consequently
+ * approximations are used.
+ */
+
+#include "ubifs.h"
+#include <linux/writeback.h>
+#include <asm/div64.h>
+
+/*
+ * When pessimistic budget calculations say that there is no enough space,
+ * UBIFS starts writing back dirty inodes and pages, doing garbage collection,
+ * or committing. The below constants define maximum number of times UBIFS
+ * repeats the operations.
+ */
+#define MAX_SHRINK_RETRIES 8
+#define MAX_GC_RETRIES 4
+#define MAX_CMT_RETRIES 2
+#define MAX_NOSPC_RETRIES 1
+
+/*
+ * The below constant defines amount of dirty pages which should be written
+ * back at when trying to shrink the liability.
+ */
+#define NR_TO_WRITE 16
+
+/**
+ * struct retries_info - information about re-tries while making free space.
+ * @prev_liability: previous liability
+ * @shrink_cnt: how many times the liability was shrinked
+ * @shrink_retries: count of liability shrink re-tries (increased when
+ * liability does not shrink)
+ * @try_gc: GC should be tried first
+ * @gc_retries: how many times GC was run
+ * @cmt_retries: how many times commit has been done
+ * @nospc_retries: how many times GC returned %-ENOSPC
+ *
+ * Since we consider budgeting to be the fast-path, and this structure has to
+ * be allocated on stack and zeroed out, we make it smaller using bit-fields.
+ */
+struct retries_info {
+ long long prev_liability;
+ unsigned int shrink_cnt;
+ unsigned int shrink_retries:5;
+ unsigned int try_gc:1;
+ unsigned int gc_retries:4;
+ unsigned int cmt_retries:3;
+ unsigned int nospc_retries:1;
+};
+
+/**
+ * shrink_liability - write-back some dirty pages/inodes.
+ * @c: UBIFS file-system description object
+ * @nr_to_write: how many dirty pages to write-back
+ *
+ * This function shrinks UBIFS liability by means of writing back some amount
+ * of dirty inodes and their pages. Returns the amount of pages which were
+ * written back. The returned value does not include dirty inodes which were
+ * synchronized.
+ *
+ * Note, this function synchronizes even VFS inodes which are locked
+ * (@i_mutex) by the caller of the budgeting function, because write-back does
+ * not touch @i_mutex.
+ */
+static int shrink_liability(struct ubifs_info *c, int nr_to_write)
+{
+ int nr_written;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .range_end = LLONG_MAX,
+ .nr_to_write = nr_to_write,
+ };
+
+ generic_sync_sb_inodes(c->vfs_sb, &wbc);
+ nr_written = nr_to_write - wbc.nr_to_write;
+
+ if (!nr_written) {
+ /*
+ * Re-try again but wait on pages/inodes which are being
+ * written-back concurrently (e.g., by pdflush).
+ */
+ memset(&wbc, 0, sizeof(struct writeback_control));
+ wbc.sync_mode = WB_SYNC_ALL;
+ wbc.range_end = LLONG_MAX;
+ wbc.nr_to_write = nr_to_write;
+ generic_sync_sb_inodes(c->vfs_sb, &wbc);
+ nr_written = nr_to_write - wbc.nr_to_write;
+ }
+
+ dbg_budg("%d pages were written back", nr_written);
+ return nr_written;
+}
+
+
+/**
+ * run_gc - run garbage collector.
+ * @c: UBIFS file-system description object
+ *
+ * This function runs garbage collector to make some more free space. Returns
+ * zero if a free LEB has been produced, %-EAGAIN if commit is required, and a
+ * negative error code in case of failure.
+ */
+static int run_gc(struct ubifs_info *c)
+{
+ int err, lnum;
+
+ /* Make some free space by garbage-collecting dirty space */
+ down_read(&c->commit_sem);
+ lnum = ubifs_garbage_collect(c, 1);
+ up_read(&c->commit_sem);
+ if (lnum < 0)
+ return lnum;
+
+ /* GC freed one LEB, return it to lprops */
+ dbg_budg("GC freed LEB %d", lnum);
+ err = ubifs_return_leb(c, lnum);
+ if (err)
+ return err;
+ return 0;
+}
+
+/**
+ * make_free_space - make more free space on the file-system.
+ * @c: UBIFS file-system description object
+ * @ri: information about previous invocations of this function
+ *
+ * This function is called when an operation cannot be budgeted because there
+ * is supposedly no free space. But in most cases there is some free space:
+ * o budgeting is pessimistic, so it always budgets more then it is actually
+ * needed, so shrinking the liability is one way to make free space - the
+ * cached data will take less space then it was budgeted for;
+ * o GC may turn some dark space into free space (budgeting treats dark space
+ * as not available);
+ * o commit may free some LEB, i.e., turn freeable LEBs into free LEBs.
+ *
+ * So this function tries to do the above. Returns %-EAGAIN if some free space
+ * was presumably made and the caller has to re-try budgeting the operation.
+ * Returns %-ENOSPC if it couldn't do more free space, and other negative error
+ * codes on failures.
+ */
+static int make_free_space(struct ubifs_info *c, struct retries_info *ri)
+{
+ int err;
+
+ /*
+ * If we have some dirty pages and inodes (liability), try to write
+ * them back unless this was tried too many times without effect
+ * already.
+ */
+ if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) {
+ long long liability;
+
+ spin_lock(&c->space_lock);
+ liability = c->budg_idx_growth + c->budg_data_growth +
+ c->budg_dd_growth;
+ spin_unlock(&c->space_lock);
+
+ if (ri->prev_liability >= liability) {
+ /* Liability does not shrink, next time try GC then */
+ ri->shrink_retries += 1;
+ if (ri->gc_retries < MAX_GC_RETRIES)
+ ri->try_gc = 1;
+ dbg_budg("liability did not shrink: retries %d of %d",
+ ri->shrink_retries, MAX_SHRINK_RETRIES);
+ }
+
+ dbg_budg("force write-back (count %d)", ri->shrink_cnt);
+ shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt);
+
+ ri->prev_liability = liability;
+ ri->shrink_cnt += 1;
+ return -EAGAIN;
+ }
+
+ /*
+ * Try to run garbage collector unless it was already tried too many
+ * times.
+ */
+ if (ri->gc_retries < MAX_GC_RETRIES) {
+ ri->gc_retries += 1;
+ dbg_budg("run GC, retries %d of %d",
+ ri->gc_retries, MAX_GC_RETRIES);
+
+ ri->try_gc = 0;
+ err = run_gc(c);
+ if (!err)
+ return -EAGAIN;
+
+ if (err == -EAGAIN) {
+ dbg_budg("GC asked to commit");
+ err = ubifs_run_commit(c);
+ if (err)
+ return err;
+ return -EAGAIN;
+ }
+
+ if (err != -ENOSPC)
+ return err;
+
+ /*
+ * GC could not make any progress. If this is the first time,
+ * then it makes sense to try to commit, because it might make
+ * some dirty space.
+ */
+ dbg_budg("GC returned -ENOSPC, retries %d",
+ ri->nospc_retries);
+ if (ri->nospc_retries >= MAX_NOSPC_RETRIES)
+ return err;
+ ri->nospc_retries += 1;
+ }
+
+ /* Neither GC nor write-back helped, try to commit */
+ if (ri->cmt_retries < MAX_CMT_RETRIES) {
+ ri->cmt_retries += 1;
+ dbg_budg("run commit, retries %d of %d",
+ ri->cmt_retries, MAX_CMT_RETRIES);
+ err = ubifs_run_commit(c);
+ if (err)
+ return err;
+ return -EAGAIN;
+ }
+ return -ENOSPC;
+}
+
+/**
+ * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates and returns the number of eraseblocks which should
+ * be kept for index usage.
+ */
+int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
+{
+ int ret;
+ uint64_t idx_size;
+
+ idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
+
+ /* And make sure we have thrice the index size of space reserved */
+ idx_size = idx_size + (idx_size << 1);
+
+ /*
+ * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
+ * pair, nor similarly the two variables for the new index size, so we
+ * have to do this costly 64-bit division on fast-path.
+ */
+ if (do_div(idx_size, c->leb_size - c->max_idx_node_sz))
+ ret = idx_size + 1;
+ else
+ ret = idx_size;
+ /*
+ * The index head is not available for the in-the-gaps method, so add an
+ * extra LEB to compensate.
+ */
+ ret += 1;
+ /*
+ * At present the index needs at least 2 LEBs: one for the index head
+ * and one for in-the-gaps method (which currently does not cater for
+ * the index head and so excludes it from consideration).
+ */
+ if (ret < 2)
+ ret = 2;
+ return ret;
+}
+
+/**
+ * ubifs_calc_available - calculate available FS space.
+ * @c: UBIFS file-system description object
+ * @min_idx_lebs: minimum number of LEBs reserved for the index
+ *
+ * This function calculates and returns amount of FS space available for use.
+ */
+long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
+{
+ int subtract_lebs;
+ long long available;
+
+ available = c->main_bytes - c->lst.total_used;
+
+ /*
+ * Now 'available' contains theoretically available flash space
+ * assuming there is no index, so we have to subtract the space which
+ * is reserved for the index.
+ */
+ subtract_lebs = min_idx_lebs;
+
+ /* Take into account that GC reserves one LEB for its own needs */
+ subtract_lebs += 1;
+
+ /*
+ * The GC journal head LEB is not really accessible. And since
+ * different write types go to different heads, we may count only on
+ * one head's space.
+ */
+ subtract_lebs += c->jhead_cnt - 1;
+
+ /* We also reserve one LEB for deletions, which bypass budgeting */
+ subtract_lebs += 1;
+
+ available -= (long long)subtract_lebs * c->leb_size;
+
+ /* Subtract the dead space which is not available for use */
+ available -= c->lst.total_dead;
+
+ /*
+ * Subtract dark space, which might or might not be usable - it depends
+ * on the data which we have on the media and which will be written. If
+ * this is a lot of uncompressed or not-compressible data, the dark
+ * space cannot be used.
+ */
+ available -= c->lst.total_dark;
+
+ /*
+ * However, there is more dark space. The index may be bigger than
+ * @min_idx_lebs. Those extra LEBs are assumed to be available, but
+ * their dark space is not included in total_dark, so it is subtracted
+ * here.
+ */
+ if (c->lst.idx_lebs > min_idx_lebs) {
+ subtract_lebs = c->lst.idx_lebs - min_idx_lebs;
+ available -= subtract_lebs * c->dark_wm;
+ }
+
+ /* The calculations are rough and may end up with a negative number */
+ return available > 0 ? available : 0;
+}
+
+/**
+ * can_use_rp - check whether the user is allowed to use reserved pool.
+ * @c: UBIFS file-system description object
+ *
+ * UBIFS has so-called "reserved pool" which is flash space reserved
+ * for the superuser and for uses whose UID/GID is recorded in UBIFS superblock.
+ * This function checks whether current user is allowed to use reserved pool.
+ * Returns %1 current user is allowed to use reserved pool and %0 otherwise.
+ */
+static int can_use_rp(struct ubifs_info *c)
+{
+ if (current->fsuid == c->rp_uid || capable(CAP_SYS_RESOURCE) ||
+ (c->rp_gid != 0 && in_group_p(c->rp_gid)))
+ return 1;
+ return 0;
+}
+
+/**
+ * do_budget_space - reserve flash space for index and data growth.
+ * @c: UBIFS file-system description object
+ *
+ * This function makes sure UBIFS has enough free eraseblocks for index growth
+ * and data.
+ *
+ * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
+ * would take if it was consolidated and written to the flash. This guarantees
+ * that the "in-the-gaps" commit method always succeeds and UBIFS will always
+ * be able to commit dirty index. So this function basically adds amount of
+ * budgeted index space to the size of the current index, multiplies this by 3,
+ * and makes sure this does not exceed the amount of free eraseblocks.
+ *
+ * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
+ * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
+ * be large, because UBIFS does not do any index consolidation as long as
+ * there is free space. IOW, the index may take a lot of LEBs, but the LEBs
+ * will contain a lot of dirt.
+ * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be
+ * consolidated to take up to @c->min_idx_lebs LEBs.
+ *
+ * This function returns zero in case of success, and %-ENOSPC in case of
+ * failure.
+ */
+static int do_budget_space(struct ubifs_info *c)
+{
+ long long outstanding, available;
+ int lebs, rsvd_idx_lebs, min_idx_lebs;
+
+ /* First budget index space */
+ min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+
+ /* Now 'min_idx_lebs' contains number of LEBs to reserve */
+ if (min_idx_lebs > c->lst.idx_lebs)
+ rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
+ else
+ rsvd_idx_lebs = 0;
+
+ /*
+ * The number of LEBs that are available to be used by the index is:
+ *
+ * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt -
+ * @c->lst.taken_empty_lebs
+ *
+ * @empty_lebs are available because they are empty. @freeable_cnt are
+ * available because they contain only free and dirty space and the
+ * index allocation always occurs after wbufs are synch'ed.
+ * @idx_gc_cnt are available because they are index LEBs that have been
+ * garbage collected (including trivial GC) and are awaiting the commit
+ * before they can be unmapped - note that the in-the-gaps method will
+ * grab these if it needs them. @taken_empty_lebs are empty_lebs that
+ * have already been allocated for some purpose (also includes those
+ * LEBs on the @idx_gc list).
+ *
+ * Note, @taken_empty_lebs may temporarily be higher by one because of
+ * the way we serialize LEB allocations and budgeting. See a comment in
+ * 'ubifs_find_free_space()'.
+ */
+ lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
+ c->lst.taken_empty_lebs;
+ if (unlikely(rsvd_idx_lebs > lebs)) {
+ dbg_budg("out of indexing space: min_idx_lebs %d (old %d), "
+ "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs,
+ rsvd_idx_lebs);
+ return -ENOSPC;
+ }
+
+ available = ubifs_calc_available(c, min_idx_lebs);
+ outstanding = c->budg_data_growth + c->budg_dd_growth;
+
+ if (unlikely(available < outstanding)) {
+ dbg_budg("out of data space: available %lld, outstanding %lld",
+ available, outstanding);
+ return -ENOSPC;
+ }
+
+ if (available - outstanding <= c->rp_size && !can_use_rp(c))
+ return -ENOSPC;
+
+ c->min_idx_lebs = min_idx_lebs;
+ return 0;
+}
+
+/**
+ * calc_idx_growth - calculate approximate index growth from budgeting request.
+ * @c: UBIFS file-system description object
+ * @req: budgeting request
+ *
+ * For now we assume each new node adds one znode. But this is rather poor
+ * approximation, though.
+ */
+static int calc_idx_growth(const struct ubifs_info *c,
+ const struct ubifs_budget_req *req)
+{
+ int znodes;
+
+ znodes = req->new_ino + (req->new_page << UBIFS_BLOCKS_PER_PAGE_SHIFT) +
+ req->new_dent;
+ return znodes * c->max_idx_node_sz;
+}
+
+/**
+ * calc_data_growth - calculate approximate amount of new data from budgeting
+ * request.
+ * @c: UBIFS file-system description object
+ * @req: budgeting request
+ */
+static int calc_data_growth(const struct ubifs_info *c,
+ const struct ubifs_budget_req *req)
+{
+ int data_growth;
+
+ data_growth = req->new_ino ? c->inode_budget : 0;
+ if (req->new_page)
+ data_growth += c->page_budget;
+ if (req->new_dent)
+ data_growth += c->dent_budget;
+ data_growth += req->new_ino_d;
+ return data_growth;
+}
+
+/**
+ * calc_dd_growth - calculate approximate amount of data which makes other data
+ * dirty from budgeting request.
+ * @c: UBIFS file-system description object
+ * @req: budgeting request
+ */
+static int calc_dd_growth(const struct ubifs_info *c,
+ const struct ubifs_budget_req *req)
+{
+ int dd_growth;
+
+ dd_growth = req->dirtied_page ? c->page_budget : 0;
+
+ if (req->dirtied_ino)
+ dd_growth += c->inode_budget << (req->dirtied_ino - 1);
+ if (req->mod_dent)
+ dd_growth += c->dent_budget;
+ dd_growth += req->dirtied_ino_d;
+ return dd_growth;
+}
+
+/**
+ * ubifs_budget_space - ensure there is enough space to complete an operation.
+ * @c: UBIFS file-system description object
+ * @req: budget request
+ *
+ * This function allocates budget for an operation. It uses pessimistic
+ * approximation of how much flash space the operation needs. The goal of this
+ * function is to make sure UBIFS always has flash space to flush all dirty
+ * pages, dirty inodes, and dirty znodes (liability). This function may force
+ * commit, garbage-collection or write-back. Returns zero in case of success,
+ * %-ENOSPC if there is no free space and other negative error codes in case of
+ * failures.
+ */
+int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
+{
+ int uninitialized_var(cmt_retries), uninitialized_var(wb_retries);
+ int err, idx_growth, data_growth, dd_growth;
+ struct retries_info ri;
+
+ ubifs_assert(req->new_page <= 1);
+ ubifs_assert(req->dirtied_page <= 1);
+ ubifs_assert(req->new_dent <= 1);
+ ubifs_assert(req->mod_dent <= 1);
+ ubifs_assert(req->new_ino <= 1);
+ ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);
+ ubifs_assert(req->dirtied_ino <= 4);
+ ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
+ ubifs_assert(!(req->new_ino_d & 7));
+ ubifs_assert(!(req->dirtied_ino_d & 7));
+
+ data_growth = calc_data_growth(c, req);
+ dd_growth = calc_dd_growth(c, req);
+ if (!data_growth && !dd_growth)
+ return 0;
+ idx_growth = calc_idx_growth(c, req);
+ memset(&ri, 0, sizeof(struct retries_info));
+
+again:
+ spin_lock(&c->space_lock);
+ ubifs_assert(c->budg_idx_growth >= 0);
+ ubifs_assert(c->budg_data_growth >= 0);
+ ubifs_assert(c->budg_dd_growth >= 0);
+
+ if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) {
+ dbg_budg("no space");
+ spin_unlock(&c->space_lock);
+ return -ENOSPC;
+ }
+
+ c->budg_idx_growth += idx_growth;
+ c->budg_data_growth += data_growth;
+ c->budg_dd_growth += dd_growth;
+
+ err = do_budget_space(c);
+ if (likely(!err)) {
+ req->idx_growth = idx_growth;
+ req->data_growth = data_growth;
+ req->dd_growth = dd_growth;
+ spin_unlock(&c->space_lock);
+ return 0;
+ }
+
+ /* Restore the old values */
+ c->budg_idx_growth -= idx_growth;
+ c->budg_data_growth -= data_growth;
+ c->budg_dd_growth -= dd_growth;
+ spin_unlock(&c->space_lock);
+
+ if (req->fast) {
+ dbg_budg("no space for fast budgeting");
+ return err;
+ }
+
+ err = make_free_space(c, &ri);
+ if (err == -EAGAIN) {
+ dbg_budg("try again");
+ cond_resched();
+ goto again;
+ } else if (err == -ENOSPC) {
+ dbg_budg("FS is full, -ENOSPC");
+ c->nospace = 1;
+ if (can_use_rp(c) || c->rp_size == 0)
+ c->nospace_rp = 1;
+ smp_wmb();
+ } else
+ ubifs_err("cannot budget space, error %d", err);
+ return err;
+}
+
+/**
+ * ubifs_release_budget - release budgeted free space.
+ * @c: UBIFS file-system description object
+ * @req: budget request
+ *
+ * This function releases the space budgeted by 'ubifs_budget_space()'. Note,
+ * since the index changes (which were budgeted for in @req->idx_growth) will
+ * only be written to the media on commit, this function moves the index budget
+ * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be
+ * zeroed by the commit operation.
+ */
+void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req)
+{
+ ubifs_assert(req->new_page <= 1);
+ ubifs_assert(req->dirtied_page <= 1);
+ ubifs_assert(req->new_dent <= 1);
+ ubifs_assert(req->mod_dent <= 1);
+ ubifs_assert(req->new_ino <= 1);
+ ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA);
+ ubifs_assert(req->dirtied_ino <= 4);
+ ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4);
+ ubifs_assert(!(req->new_ino_d & 7));
+ ubifs_assert(!(req->dirtied_ino_d & 7));
+ if (!req->recalculate) {
+ ubifs_assert(req->idx_growth >= 0);
+ ubifs_assert(req->data_growth >= 0);
+ ubifs_assert(req->dd_growth >= 0);
+ }
+
+ if (req->recalculate) {
+ req->data_growth = calc_data_growth(c, req);
+ req->dd_growth = calc_dd_growth(c, req);
+ req->idx_growth = calc_idx_growth(c, req);
+ }
+
+ if (!req->data_growth && !req->dd_growth)
+ return;
+
+ c->nospace = c->nospace_rp = 0;
+ smp_wmb();
+
+ spin_lock(&c->space_lock);
+ c->budg_idx_growth -= req->idx_growth;
+ c->budg_uncommitted_idx += req->idx_growth;
+ c->budg_data_growth -= req->data_growth;
+ c->budg_dd_growth -= req->dd_growth;
+ c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+
+ ubifs_assert(c->budg_idx_growth >= 0);
+ ubifs_assert(c->budg_data_growth >= 0);
+ ubifs_assert(c->budg_dd_growth >= 0);
+ ubifs_assert(c->min_idx_lebs < c->main_lebs);
+ ubifs_assert(!(c->budg_idx_growth & 7));
+ ubifs_assert(!(c->budg_data_growth & 7));
+ ubifs_assert(!(c->budg_dd_growth & 7));
+ spin_unlock(&c->space_lock);
+}
+
+/**
+ * ubifs_convert_page_budget - convert budget of a new page.
+ * @c: UBIFS file-system description object
+ *
+ * This function converts budget which was allocated for a new page of data to
+ * the budget of changing an existing page of data. The latter is smaller then
+ * the former, so this function only does simple re-calculation and does not
+ * involve any write-back.
+ */
+void ubifs_convert_page_budget(struct ubifs_info *c)
+{
+ spin_lock(&c->space_lock);
+ /* Release the index growth reservation */
+ c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+ /* Release the data growth reservation */
+ c->budg_data_growth -= c->page_budget;
+ /* Increase the dirty data growth reservation instead */
+ c->budg_dd_growth += c->page_budget;
+ /* And re-calculate the indexing space reservation */
+ c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+ spin_unlock(&c->space_lock);
+}
+
+/**
+ * ubifs_release_dirty_inode_budget - release dirty inode budget.
+ * @c: UBIFS file-system description object
+ * @ui: UBIFS inode to release the budget for
+ *
+ * This function releases budget corresponding to a dirty inode. It is usually
+ * called when after the inode has been written to the media and marked as
+ * clean.
+ */
+void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
+ struct ubifs_inode *ui)
+{
+ struct ubifs_budget_req req;
+
+ memset(&req, 0, sizeof(struct ubifs_budget_req));
+ req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8);
+ ubifs_release_budget(c, &req);
+}
+
+/**
+ * ubifs_reported_space - calculate reported free space.
+ * @c: the UBIFS file-system description object
+ * @free: amount of free space
+ *
+ * This function calculates amount of free space which will be reported to
+ * user-space. User-space application tend to expect that if the file-system
+ * (e.g., via the 'statfs()' call) reports that it has N bytes available, they
+ * are able to write a file of size N. UBIFS attaches node headers to each data
+ * node and it has to write indexind nodes as well. This introduces additional
+ * overhead, and UBIFS it has to report sligtly less free space to meet the
+ * above expectetion.
+ *
+ * This function assumes free space is made up of uncompressed data nodes and
+ * full index nodes (one per data node, tripled because we always allow enough
+ * space to write the index thrice).
+ *
+ * Note, the calculation is pessimistic, which means that most of the time
+ * UBIFS reports less space than it actually has.
+ */
+long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
+{
+ int divisor, factor, f;
+
+ /*
+ * Reported space size is @free * X, where X is UBIFS block size
+ * divided by UBIFS block size + all overhead one data block
+ * introduces. The overhead is the node header + indexing overhead.
+ *
+ * Indexing overhead calculations are based on the following formula:
+ * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number
+ * of data nodes, f - fanout. Because effective UBIFS fanout is twice
+ * as less than maximum fanout, we assume that each data node
+ * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
+ * Note, the multiplier 3 is because UBIFS reseves thrice as more space
+ * for the index.
+ */
+ f = c->fanout > 3 ? c->fanout >> 1 : 2;
+ factor = UBIFS_BLOCK_SIZE;
+ divisor = UBIFS_MAX_DATA_NODE_SZ;
+ divisor += (c->max_idx_node_sz * 3) / (f - 1);
+ free *= factor;
+ do_div(free, divisor);
+ return free;
+}
+
+/**
+ * ubifs_get_free_space - return amount of free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates amount of free space to report to user-space.
+ *
+ * Because UBIFS may introduce substantial overhead (the index, node headers,
+ * alighment, wastage at the end of eraseblocks, etc), it cannot report real
+ * amount of free flash space it has (well, because not all dirty space is
+ * reclamable, UBIFS does not actually know the real amount). If UBIFS did so,
+ * it would bread user expectetion about what free space is. Users seem to
+ * accustomed to assume that if the file-system reports N bytes of free space,
+ * they would be able to fit a file of N bytes to the FS. This almost works for
+ * traditional file-systems, because they have way less overhead than UBIFS.
+ * So, to keep users happy, UBIFS tries to take the overhead into account.
+ */
+long long ubifs_get_free_space(struct ubifs_info *c)
+{
+ int min_idx_lebs, rsvd_idx_lebs, lebs;
+ long long available, outstanding, free;
+
+ spin_lock(&c->space_lock);
+ min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+ outstanding = c->budg_data_growth + c->budg_dd_growth;
+
+ /*
+ * Force the amount available to the total size reported if the used
+ * space is zero.
+ */
+ if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
+ spin_unlock(&c->space_lock);
+ return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
+ }
+
+ available = ubifs_calc_available(c, min_idx_lebs);
+
+ /*
+ * When reporting free space to user-space, UBIFS guarantees that it is
+ * possible to write a file of free space size. This means that for
+ * empty LEBs we may use more precise calculations than
+ * 'ubifs_calc_available()' is using. Namely, we know that in empty
+ * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm.
+ * Thus, amend the available space.
+ *
+ * Note, the calculations below are similar to what we have in
+ * 'do_budget_space()', so refer there for comments.
+ */
+ if (min_idx_lebs > c->lst.idx_lebs)
+ rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
+ else
+ rsvd_idx_lebs = 0;
+ lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
+ c->lst.taken_empty_lebs;
+ lebs -= rsvd_idx_lebs;
+ available += lebs * (c->dark_wm - c->leb_overhead);
+ spin_unlock(&c->space_lock);
+
+ if (available > outstanding)
+ free = ubifs_reported_space(c, available - outstanding);
+ else
+ free = 0;
+ return free;
+}
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
new file mode 100644
index 00000000000..0a6aa2cc78f
--- /dev/null
+++ b/fs/ubifs/commit.c
@@ -0,0 +1,678 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements functions that manage the running of the commit process.
+ * Each affected module has its own functions to accomplish their part in the
+ * commit and those functions are called here.
+ *
+ * The commit is the process whereby all updates to the index and LEB properties
+ * are written out together and the journal becomes empty. This keeps the
+ * file system consistent - at all times the state can be recreated by reading
+ * the index and LEB properties and then replaying the journal.
+ *
+ * The commit is split into two parts named "commit start" and "commit end".
+ * During commit start, the commit process has exclusive access to the journal
+ * by holding the commit semaphore down for writing. As few I/O operations as
+ * possible are performed during commit start, instead the nodes that are to be
+ * written are merely identified. During commit end, the commit semaphore is no
+ * longer held and the journal is again in operation, allowing users to continue
+ * to use the file system while the bulk of the commit I/O is performed. The
+ * purpose of this two-step approach is to prevent the commit from causing any
+ * latency blips. Note that in any case, the commit does not prevent lookups
+ * (as permitted by the TNC mutex), or access to VFS data structures e.g. page
+ * cache.
+ */
+
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include "ubifs.h"
+
+/**
+ * do_commit - commit the journal.
+ * @c: UBIFS file-system description object
+ *
+ * This function implements UBIFS commit. It has to be called with commit lock
+ * locked. Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static int do_commit(struct ubifs_info *c)
+{
+ int err, new_ltail_lnum, old_ltail_lnum, i;
+ struct ubifs_zbranch zroot;
+ struct ubifs_lp_stats lst;
+
+ dbg_cmt("start");
+ if (c->ro_media) {
+ err = -EROFS;
+ goto out_up;
+ }
+
+ /* Sync all write buffers (necessary for recovery) */
+ for (i = 0; i < c->jhead_cnt; i++) {
+ err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+ if (err)
+ goto out_up;
+ }
+
+ c->cmt_no += 1;
+ err = ubifs_gc_start_commit(c);
+ if (err)
+ goto out_up;
+ err = dbg_check_lprops(c);
+ if (err)
+ goto out_up;
+ err = ubifs_log_start_commit(c, &new_ltail_lnum);
+ if (err)
+ goto out_up;
+ err = ubifs_tnc_start_commit(c, &zroot);
+ if (err)
+ goto out_up;
+ err = ubifs_lpt_start_commit(c);
+ if (err)
+ goto out_up;
+ err = ubifs_orphan_start_commit(c);
+ if (err)
+ goto out_up;
+
+ ubifs_get_lp_stats(c, &lst);
+
+ up_write(&c->commit_sem);
+
+ err = ubifs_tnc_end_commit(c);
+ if (err)
+ goto out;
+ err = ubifs_lpt_end_commit(c);
+ if (err)
+ goto out;
+ err = ubifs_orphan_end_commit(c);
+ if (err)
+ goto out;
+ old_ltail_lnum = c->ltail_lnum;
+ err = ubifs_log_end_commit(c, new_ltail_lnum);
+ if (err)
+ goto out;
+ err = dbg_check_old_index(c, &zroot);
+ if (err)
+ goto out;
+
+ mutex_lock(&c->mst_mutex);
+ c->mst_node->cmt_no = cpu_to_le64(c->cmt_no);
+ c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum);
+ c->mst_node->root_lnum = cpu_to_le32(zroot.lnum);
+ c->mst_node->root_offs = cpu_to_le32(zroot.offs);
+ c->mst_node->root_len = cpu_to_le32(zroot.len);
+ c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum);
+ c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs);
+ c->mst_node->index_size = cpu_to_le64(c->old_idx_sz);
+ c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum);
+ c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs);
+ c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum);
+ c->mst_node->nhead_offs = cpu_to_le32(c->nhead_offs);
+ c->mst_node->ltab_lnum = cpu_to_le32(c->ltab_lnum);
+ c->mst_node->ltab_offs = cpu_to_le32(c->ltab_offs);
+ c->mst_node->lsave_lnum = cpu_to_le32(c->lsave_lnum);
+ c->mst_node->lsave_offs = cpu_to_le32(c->lsave_offs);
+ c->mst_node->lscan_lnum = cpu_to_le32(c->lscan_lnum);
+ c->mst_node->empty_lebs = cpu_to_le32(lst.empty_lebs);
+ c->mst_node->idx_lebs = cpu_to_le32(lst.idx_lebs);
+ c->mst_node->total_free = cpu_to_le64(lst.total_free);
+ c->mst_node->total_dirty = cpu_to_le64(lst.total_dirty);
+ c->mst_node->total_used = cpu_to_le64(lst.total_used);
+ c->mst_node->total_dead = cpu_to_le64(lst.total_dead);
+ c->mst_node->total_dark = cpu_to_le64(lst.total_dark);
+ if (c->no_orphs)
+ c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
+ else
+ c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS);
+ err = ubifs_write_master(c);
+ mutex_unlock(&c->mst_mutex);
+ if (err)
+ goto out;
+
+ err = ubifs_log_post_commit(c, old_ltail_lnum);
+ if (err)
+ goto out;
+ err = ubifs_gc_end_commit(c);
+ if (err)
+ goto out;
+ err = ubifs_lpt_post_commit(c);
+ if (err)
+ goto out;
+
+ spin_lock(&c->cs_lock);
+ c->cmt_state = COMMIT_RESTING;
+ wake_up(&c->cmt_wq);
+ dbg_cmt("commit end");
+ spin_unlock(&c->cs_lock);
+
+ return 0;
+
+out_up:
+ up_write(&c->commit_sem);
+out:
+ ubifs_err("commit failed, error %d", err);
+ spin_lock(&c->cs_lock);
+ c->cmt_state = COMMIT_BROKEN;
+ wake_up(&c->cmt_wq);
+ spin_unlock(&c->cs_lock);
+ ubifs_ro_mode(c, err);
+ return err;
+}
+
+/**
+ * run_bg_commit - run background commit if it is needed.
+ * @c: UBIFS file-system description object
+ *
+ * This function runs background commit if it is needed. Returns zero in case
+ * of success and a negative error code in case of failure.
+ */
+static int run_bg_commit(struct ubifs_info *c)
+{
+ spin_lock(&c->cs_lock);
+ /*
+ * Run background commit only if background commit was requested or if
+ * commit is required.
+ */
+ if (c->cmt_state != COMMIT_BACKGROUND &&
+ c->cmt_state != COMMIT_REQUIRED)
+ goto out;
+ spin_unlock(&c->cs_lock);
+
+ down_write(&c->commit_sem);
+ spin_lock(&c->cs_lock);
+ if (c->cmt_state == COMMIT_REQUIRED)
+ c->cmt_state = COMMIT_RUNNING_REQUIRED;
+ else if (c->cmt_state == COMMIT_BACKGROUND)
+ c->cmt_state = COMMIT_RUNNING_BACKGROUND;
+ else
+ goto out_cmt_unlock;
+ spin_unlock(&c->cs_lock);
+
+ return do_commit(c);
+
+out_cmt_unlock:
+ up_write(&c->commit_sem);
+out:
+ spin_unlock(&c->cs_lock);
+ return 0;
+}
+
+/**
+ * ubifs_bg_thread - UBIFS background thread function.
+ * @info: points to the file-system description object
+ *
+ * This function implements various file-system background activities:
+ * o when a write-buffer timer expires it synchronizes the appropriate
+ * write-buffer;
+ * o when the journal is about to be full, it starts in-advance commit.
+ *
+ * Note, other stuff like background garbage collection may be added here in
+ * future.
+ */
+int ubifs_bg_thread(void *info)
+{
+ int err;
+ struct ubifs_info *c = info;
+
+ ubifs_msg("background thread \"%s\" started, PID %d",
+ c->bgt_name, current->pid);
+ set_freezable();
+
+ while (1) {
+ if (kthread_should_stop())
+ break;
+
+ if (try_to_freeze())
+ continue;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* Check if there is something to do */
+ if (!c->need_bgt) {
+ /*
+ * Nothing prevents us from going sleep now and
+ * be never woken up and block the task which
+ * could wait in 'kthread_stop()' forever.
+ */
+ if (kthread_should_stop())
+ break;
+ schedule();
+ continue;
+ } else
+ __set_current_state(TASK_RUNNING);
+
+ c->need_bgt = 0;
+ err = ubifs_bg_wbufs_sync(c);
+ if (err)
+ ubifs_ro_mode(c, err);
+
+ run_bg_commit(c);
+ cond_resched();
+ }
+
+ dbg_msg("background thread \"%s\" stops", c->bgt_name);
+ return 0;
+}
+
+/**
+ * ubifs_commit_required - set commit state to "required".
+ * @c: UBIFS file-system description object
+ *
+ * This function is called if a commit is required but cannot be done from the
+ * calling function, so it is just flagged instead.
+ */
+void ubifs_commit_required(struct ubifs_info *c)
+{
+ spin_lock(&c->cs_lock);
+ switch (c->cmt_state) {
+ case COMMIT_RESTING:
+ case COMMIT_BACKGROUND:
+ dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
+ dbg_cstate(COMMIT_REQUIRED));
+ c->cmt_state = COMMIT_REQUIRED;
+ break;
+ case COMMIT_RUNNING_BACKGROUND:
+ dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
+ dbg_cstate(COMMIT_RUNNING_REQUIRED));
+ c->cmt_state = COMMIT_RUNNING_REQUIRED;
+ break;
+ case COMMIT_REQUIRED:
+ case COMMIT_RUNNING_REQUIRED:
+ case COMMIT_BROKEN:
+ break;
+ }
+ spin_unlock(&c->cs_lock);
+}
+
+/**
+ * ubifs_request_bg_commit - notify the background thread to do a commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function is called if the journal is full enough to make a commit
+ * worthwhile, so background thread is kicked to start it.
+ */
+void ubifs_request_bg_commit(struct ubifs_info *c)
+{
+ spin_lock(&c->cs_lock);
+ if (c->cmt_state == COMMIT_RESTING) {
+ dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state),
+ dbg_cstate(COMMIT_BACKGROUND));
+ c->cmt_state = COMMIT_BACKGROUND;
+ spin_unlock(&c->cs_lock);
+ ubifs_wake_up_bgt(c);
+ } else
+ spin_unlock(&c->cs_lock);
+}
+
+/**
+ * wait_for_commit - wait for commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function sleeps until the commit operation is no longer running.
+ */
+static int wait_for_commit(struct ubifs_info *c)
+{
+ dbg_cmt("pid %d goes sleep", current->pid);
+
+ /*
+ * The following sleeps if the condition is false, and will be woken
+ * when the commit ends. It is possible, although very unlikely, that we
+ * will wake up and see the subsequent commit running, rather than the
+ * one we were waiting for, and go back to sleep. However, we will be
+ * woken again, so there is no danger of sleeping forever.
+ */
+ wait_event(c->cmt_wq, c->cmt_state != COMMIT_RUNNING_BACKGROUND &&
+ c->cmt_state != COMMIT_RUNNING_REQUIRED);
+ dbg_cmt("commit finished, pid %d woke up", current->pid);
+ return 0;
+}
+
+/**
+ * ubifs_run_commit - run or wait for commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function runs commit and returns zero in case of success and a negative
+ * error code in case of failure.
+ */
+int ubifs_run_commit(struct ubifs_info *c)
+{
+ int err = 0;
+
+ spin_lock(&c->cs_lock);
+ if (c->cmt_state == COMMIT_BROKEN) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (c->cmt_state == COMMIT_RUNNING_BACKGROUND)
+ /*
+ * We set the commit state to 'running required' to indicate
+ * that we want it to complete as quickly as possible.
+ */
+ c->cmt_state = COMMIT_RUNNING_REQUIRED;
+
+ if (c->cmt_state == COMMIT_RUNNING_REQUIRED) {
+ spin_unlock(&c->cs_lock);
+ return wait_for_commit(c);
+ }
+ spin_unlock(&c->cs_lock);
+
+ /* Ok, the commit is indeed needed */
+
+ down_write(&c->commit_sem);
+ spin_lock(&c->cs_lock);
+ /*
+ * Since we unlocked 'c->cs_lock', the state may have changed, so
+ * re-check it.
+ */
+ if (c->cmt_state == COMMIT_BROKEN) {
+ err = -EINVAL;
+ goto out_cmt_unlock;
+ }
+
+ if (c->cmt_state == COMMIT_RUNNING_BACKGROUND)
+ c->cmt_state = COMMIT_RUNNING_REQUIRED;
+
+ if (c->cmt_state == COMMIT_RUNNING_REQUIRED) {
+ up_write(&c->commit_sem);
+ spin_unlock(&c->cs_lock);
+ return wait_for_commit(c);
+ }
+ c->cmt_state = COMMIT_RUNNING_REQUIRED;
+ spin_unlock(&c->cs_lock);
+
+ err = do_commit(c);
+ return err;
+
+out_cmt_unlock:
+ up_write(&c->commit_sem);
+out:
+ spin_unlock(&c->cs_lock);
+ return err;
+}
+
+/**
+ * ubifs_gc_should_commit - determine if it is time for GC to run commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function is called by garbage collection to determine if commit should
+ * be run. If commit state is @COMMIT_BACKGROUND, which means that the journal
+ * is full enough to start commit, this function returns true. It is not
+ * absolutely necessary to commit yet, but it feels like this should be better
+ * then to keep doing GC. This function returns %1 if GC has to initiate commit
+ * and %0 if not.
+ */
+int ubifs_gc_should_commit(struct ubifs_info *c)
+{
+ int ret = 0;
+
+ spin_lock(&c->cs_lock);
+ if (c->cmt_state == COMMIT_BACKGROUND) {
+ dbg_cmt("commit required now");
+ c->cmt_state = COMMIT_REQUIRED;
+ } else
+ dbg_cmt("commit not requested");
+ if (c->cmt_state == COMMIT_REQUIRED)
+ ret = 1;
+ spin_unlock(&c->cs_lock);
+ return ret;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * struct idx_node - hold index nodes during index tree traversal.
+ * @list: list
+ * @iip: index in parent (slot number of this indexing node in the parent
+ * indexing node)
+ * @upper_key: all keys in this indexing node have to be less or equivalent to
+ * this key
+ * @idx: index node (8-byte aligned because all node structures must be 8-byte
+ * aligned)
+ */
+struct idx_node {
+ struct list_head list;
+ int iip;
+ union ubifs_key upper_key;
+ struct ubifs_idx_node idx __attribute__((aligned(8)));
+};
+
+/**
+ * dbg_old_index_check_init - get information for the next old index check.
+ * @c: UBIFS file-system description object
+ * @zroot: root of the index
+ *
+ * This function records information about the index that will be needed for the
+ * next old index check i.e. 'dbg_check_old_index()'.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
+{
+ struct ubifs_idx_node *idx;
+ int lnum, offs, len, err = 0;
+
+ c->old_zroot = *zroot;
+
+ lnum = c->old_zroot.lnum;
+ offs = c->old_zroot.offs;
+ len = c->old_zroot.len;
+
+ idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
+ if (!idx)
+ return -ENOMEM;
+
+ err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
+ if (err)
+ goto out;
+
+ c->old_zroot_level = le16_to_cpu(idx->level);
+ c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
+out:
+ kfree(idx);
+ return err;
+}
+
+/**
+ * dbg_check_old_index - check the old copy of the index.
+ * @c: UBIFS file-system description object
+ * @zroot: root of the new index
+ *
+ * In order to be able to recover from an unclean unmount, a complete copy of
+ * the index must exist on flash. This is the "old" index. The commit process
+ * must write the "new" index to flash without overwriting or destroying any
+ * part of the old index. This function is run at commit end in order to check
+ * that the old index does indeed exist completely intact.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
+{
+ int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
+ int first = 1, iip;
+ union ubifs_key lower_key, upper_key, l_key, u_key;
+ unsigned long long uninitialized_var(last_sqnum);
+ struct ubifs_idx_node *idx;
+ struct list_head list;
+ struct idx_node *i;
+ size_t sz;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX))
+ goto out;
+
+ INIT_LIST_HEAD(&list);
+
+ sz = sizeof(struct idx_node) + ubifs_idx_node_sz(c, c->fanout) -
+ UBIFS_IDX_NODE_SZ;
+
+ /* Start at the old zroot */
+ lnum = c->old_zroot.lnum;
+ offs = c->old_zroot.offs;
+ len = c->old_zroot.len;
+ iip = 0;
+
+ /*
+ * Traverse the index tree preorder depth-first i.e. do a node and then
+ * its subtrees from left to right.
+ */
+ while (1) {
+ struct ubifs_branch *br;
+
+ /* Get the next index node */
+ i = kmalloc(sz, GFP_NOFS);
+ if (!i) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ i->iip = iip;
+ /* Keep the index nodes on our path in a linked list */
+ list_add_tail(&i->list, &list);
+ /* Read the index node */
+ idx = &i->idx;
+ err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
+ if (err)
+ goto out_free;
+ /* Validate index node */
+ child_cnt = le16_to_cpu(idx->child_cnt);
+ if (child_cnt < 1 || child_cnt > c->fanout) {
+ err = 1;
+ goto out_dump;
+ }
+ if (first) {
+ first = 0;
+ /* Check root level and sqnum */
+ if (le16_to_cpu(idx->level) != c->old_zroot_level) {
+ err = 2;
+ goto out_dump;
+ }
+ if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) {
+ err = 3;
+ goto out_dump;
+ }
+ /* Set last values as though root had a parent */
+ last_level = le16_to_cpu(idx->level) + 1;
+ last_sqnum = le64_to_cpu(idx->ch.sqnum) + 1;
+ key_read(c, ubifs_idx_key(c, idx), &lower_key);
+ highest_ino_key(c, &upper_key, INUM_WATERMARK);
+ }
+ key_copy(c, &upper_key, &i->upper_key);
+ if (le16_to_cpu(idx->level) != last_level - 1) {
+ err = 3;
+ goto out_dump;
+ }
+ /*
+ * The index is always written bottom up hence a child's sqnum
+ * is always less than the parents.
+ */
+ if (le64_to_cpu(idx->ch.sqnum) >= last_sqnum) {
+ err = 4;
+ goto out_dump;
+ }
+ /* Check key range */
+ key_read(c, ubifs_idx_key(c, idx), &l_key);
+ br = ubifs_idx_branch(c, idx, child_cnt - 1);
+ key_read(c, &br->key, &u_key);
+ if (keys_cmp(c, &lower_key, &l_key) > 0) {
+ err = 5;
+ goto out_dump;
+ }
+ if (keys_cmp(c, &upper_key, &u_key) < 0) {
+ err = 6;
+ goto out_dump;
+ }
+ if (keys_cmp(c, &upper_key, &u_key) == 0)
+ if (!is_hash_key(c, &u_key)) {
+ err = 7;
+ goto out_dump;
+ }
+ /* Go to next index node */
+ if (le16_to_cpu(idx->level) == 0) {
+ /* At the bottom, so go up until can go right */
+ while (1) {
+ /* Drop the bottom of the list */
+ list_del(&i->list);
+ kfree(i);
+ /* No more list means we are done */
+ if (list_empty(&list))
+ goto out;
+ /* Look at the new bottom */
+ i = list_entry(list.prev, struct idx_node,
+ list);
+ idx = &i->idx;
+ /* Can we go right */
+ if (iip + 1 < le16_to_cpu(idx->child_cnt)) {
+ iip = iip + 1;
+ break;
+ } else
+ /* Nope, so go up again */
+ iip = i->iip;
+ }
+ } else
+ /* Go down left */
+ iip = 0;
+ /*
+ * We have the parent in 'idx' and now we set up for reading the
+ * child pointed to by slot 'iip'.
+ */
+ last_level = le16_to_cpu(idx->level);
+ last_sqnum = le64_to_cpu(idx->ch.sqnum);
+ br = ubifs_idx_branch(c, idx, iip);
+ lnum = le32_to_cpu(br->lnum);
+ offs = le32_to_cpu(br->offs);
+ len = le32_to_cpu(br->len);
+ key_read(c, &br->key, &lower_key);
+ if (iip + 1 < le16_to_cpu(idx->child_cnt)) {
+ br = ubifs_idx_branch(c, idx, iip + 1);
+ key_read(c, &br->key, &upper_key);
+ } else
+ key_copy(c, &i->upper_key, &upper_key);
+ }
+out:
+ err = dbg_old_index_check_init(c, zroot);
+ if (err)
+ goto out_free;
+
+ return 0;
+
+out_dump:
+ dbg_err("dumping index node (iip=%d)", i->iip);
+ dbg_dump_node(c, idx);
+ list_del(&i->list);
+ kfree(i);
+ if (!list_empty(&list)) {
+ i = list_entry(list.prev, struct idx_node, list);
+ dbg_err("dumping parent index node");
+ dbg_dump_node(c, &i->idx);
+ }
+out_free:
+ while (!list_empty(&list)) {
+ i = list_entry(list.next, struct idx_node, list);
+ list_del(&i->list);
+ kfree(i);
+ }
+ ubifs_err("failed, error %d", err);
+ if (err > 0)
+ err = -EINVAL;
+ return err;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
new file mode 100644
index 00000000000..5bb51dac3c1
--- /dev/null
+++ b/fs/ubifs/compress.c
@@ -0,0 +1,253 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ * Zoltan Sogor
+ */
+
+/*
+ * This file provides a single place to access to compression and
+ * decompression.
+ */
+
+#include <linux/crypto.h>
+#include "ubifs.h"
+
+/* Fake description object for the "none" compressor */
+static struct ubifs_compressor none_compr = {
+ .compr_type = UBIFS_COMPR_NONE,
+ .name = "no compression",
+ .capi_name = "",
+};
+
+#ifdef CONFIG_UBIFS_FS_LZO
+static DEFINE_MUTEX(lzo_mutex);
+
+static struct ubifs_compressor lzo_compr = {
+ .compr_type = UBIFS_COMPR_LZO,
+ .comp_mutex = &lzo_mutex,
+ .name = "LZO",
+ .capi_name = "lzo",
+};
+#else
+static struct ubifs_compressor lzo_compr = {
+ .compr_type = UBIFS_COMPR_LZO,
+ .name = "LZO",
+};
+#endif
+
+#ifdef CONFIG_UBIFS_FS_ZLIB
+static DEFINE_MUTEX(deflate_mutex);
+static DEFINE_MUTEX(inflate_mutex);
+
+static struct ubifs_compressor zlib_compr = {
+ .compr_type = UBIFS_COMPR_ZLIB,
+ .comp_mutex = &deflate_mutex,
+ .decomp_mutex = &inflate_mutex,
+ .name = "zlib",
+ .capi_name = "deflate",
+};
+#else
+static struct ubifs_compressor zlib_compr = {
+ .compr_type = UBIFS_COMPR_ZLIB,
+ .name = "zlib",
+};
+#endif
+
+/* All UBIFS compressors */
+struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
+
+/**
+ * ubifs_compress - compress data.
+ * @in_buf: data to compress
+ * @in_len: length of the data to compress
+ * @out_buf: output buffer where compressed data should be stored
+ * @out_len: output buffer length is returned here
+ * @compr_type: type of compression to use on enter, actually used compression
+ * type on exit
+ *
+ * This function compresses input buffer @in_buf of length @in_len and stores
+ * the result in the output buffer @out_buf and the resulting length in
+ * @out_len. If the input buffer does not compress, it is just copied to the
+ * @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE or if
+ * compression error occurred.
+ *
+ * Note, if the input buffer was not compressed, it is copied to the output
+ * buffer and %UBIFS_COMPR_NONE is returned in @compr_type.
+ *
+ * This functions returns %0 on success or a negative error code on failure.
+ */
+void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
+ int *compr_type)
+{
+ int err;
+ struct ubifs_compressor *compr = ubifs_compressors[*compr_type];
+
+ if (*compr_type == UBIFS_COMPR_NONE)
+ goto no_compr;
+
+ /* If the input data is small, do not even try to compress it */
+ if (in_len < UBIFS_MIN_COMPR_LEN)
+ goto no_compr;
+
+ if (compr->comp_mutex)
+ mutex_lock(compr->comp_mutex);
+ err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf,
+ out_len);
+ if (compr->comp_mutex)
+ mutex_unlock(compr->comp_mutex);
+ if (unlikely(err)) {
+ ubifs_warn("cannot compress %d bytes, compressor %s, "
+ "error %d, leave data uncompressed",
+ in_len, compr->name, err);
+ goto no_compr;
+ }
+
+ /*
+ * Presently, we just require that compression results in less data,
+ * rather than any defined minimum compression ratio or amount.
+ */
+ if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8))
+ goto no_compr;
+
+ return;
+
+no_compr:
+ memcpy(out_buf, in_buf, in_len);
+ *out_len = in_len;
+ *compr_type = UBIFS_COMPR_NONE;
+}
+
+/**
+ * ubifs_decompress - decompress data.
+ * @in_buf: data to decompress
+ * @in_len: length of the data to decompress
+ * @out_buf: output buffer where decompressed data should
+ * @out_len: output length is returned here
+ * @compr_type: type of compression
+ *
+ * This function decompresses data from buffer @in_buf into buffer @out_buf.
+ * The length of the uncompressed data is returned in @out_len. This functions
+ * returns %0 on success or a negative error code on failure.
+ */
+int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
+ int *out_len, int compr_type)
+{
+ int err;
+ struct ubifs_compressor *compr;
+
+ if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) {
+ ubifs_err("invalid compression type %d", compr_type);
+ return -EINVAL;
+ }
+
+ compr = ubifs_compressors[compr_type];
+
+ if (unlikely(!compr->capi_name)) {
+ ubifs_err("%s compression is not compiled in", compr->name);
+ return -EINVAL;
+ }
+
+ if (compr_type == UBIFS_COMPR_NONE) {
+ memcpy(out_buf, in_buf, in_len);
+ *out_len = in_len;
+ return 0;
+ }
+
+ if (compr->decomp_mutex)
+ mutex_lock(compr->decomp_mutex);
+ err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf,
+ out_len);
+ if (compr->decomp_mutex)
+ mutex_unlock(compr->decomp_mutex);
+ if (err)
+ ubifs_err("cannot decompress %d bytes, compressor %s, "
+ "error %d", in_len, compr->name, err);
+
+ return err;
+}
+
+/**
+ * compr_init - initialize a compressor.
+ * @compr: compressor description object
+ *
+ * This function initializes the requested compressor and returns zero in case
+ * of success or a negative error code in case of failure.
+ */
+static int __init compr_init(struct ubifs_compressor *compr)
+{
+ if (compr->capi_name) {
+ compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0);
+ if (IS_ERR(compr->cc)) {
+ ubifs_err("cannot initialize compressor %s, error %ld",
+ compr->name, PTR_ERR(compr->cc));
+ return PTR_ERR(compr->cc);
+ }
+ }
+
+ ubifs_compressors[compr->compr_type] = compr;
+ return 0;
+}
+
+/**
+ * compr_exit - de-initialize a compressor.
+ * @compr: compressor description object
+ */
+static void compr_exit(struct ubifs_compressor *compr)
+{
+ if (compr->capi_name)
+ crypto_free_comp(compr->cc);
+ return;
+}
+
+/**
+ * ubifs_compressors_init - initialize UBIFS compressors.
+ *
+ * This function initializes the compressor which were compiled in. Returns
+ * zero in case of success and a negative error code in case of failure.
+ */
+int __init ubifs_compressors_init(void)
+{
+ int err;
+
+ err = compr_init(&lzo_compr);
+ if (err)
+ return err;
+
+ err = compr_init(&zlib_compr);
+ if (err)
+ goto out_lzo;
+
+ ubifs_compressors[UBIFS_COMPR_NONE] = &none_compr;
+ return 0;
+
+out_lzo:
+ compr_exit(&lzo_compr);
+ return err;
+}
+
+/**
+ * ubifs_compressors_exit - de-initialize UBIFS compressors.
+ */
+void __exit ubifs_compressors_exit(void)
+{
+ compr_exit(&lzo_compr);
+ compr_exit(&zlib_compr);
+}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
new file mode 100644
index 00000000000..d7f7645779f
--- /dev/null
+++ b/fs/ubifs/debug.c
@@ -0,0 +1,2290 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file implements most of the debugging stuff which is compiled in only
+ * when it is enabled. But some debugging check functions are implemented in
+ * corresponding subsystem, just because they are closely related and utilize
+ * various local functions of those subsystems.
+ */
+
+#define UBIFS_DBG_PRESERVE_UBI
+
+#include "ubifs.h"
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+DEFINE_SPINLOCK(dbg_lock);
+
+static char dbg_key_buf0[128];
+static char dbg_key_buf1[128];
+
+unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT;
+unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT;
+unsigned int ubifs_tst_flags;
+
+module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
+module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR);
+module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR);
+
+MODULE_PARM_DESC(debug_msgs, "Debug message type flags");
+MODULE_PARM_DESC(debug_chks, "Debug check flags");
+MODULE_PARM_DESC(debug_tsts, "Debug special test flags");
+
+static const char *get_key_fmt(int fmt)
+{
+ switch (fmt) {
+ case UBIFS_SIMPLE_KEY_FMT:
+ return "simple";
+ default:
+ return "unknown/invalid format";
+ }
+}
+
+static const char *get_key_hash(int hash)
+{
+ switch (hash) {
+ case UBIFS_KEY_HASH_R5:
+ return "R5";
+ case UBIFS_KEY_HASH_TEST:
+ return "test";
+ default:
+ return "unknown/invalid name hash";
+ }
+}
+
+static const char *get_key_type(int type)
+{
+ switch (type) {
+ case UBIFS_INO_KEY:
+ return "inode";
+ case UBIFS_DENT_KEY:
+ return "direntry";
+ case UBIFS_XENT_KEY:
+ return "xentry";
+ case UBIFS_DATA_KEY:
+ return "data";
+ case UBIFS_TRUN_KEY:
+ return "truncate";
+ default:
+ return "unknown/invalid key";
+ }
+}
+
+static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
+ char *buffer)
+{
+ char *p = buffer;
+ int type = key_type(c, key);
+
+ if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
+ switch (type) {
+ case UBIFS_INO_KEY:
+ sprintf(p, "(%lu, %s)", key_inum(c, key),
+ get_key_type(type));
+ break;
+ case UBIFS_DENT_KEY:
+ case UBIFS_XENT_KEY:
+ sprintf(p, "(%lu, %s, %#08x)", key_inum(c, key),
+ get_key_type(type), key_hash(c, key));
+ break;
+ case UBIFS_DATA_KEY:
+ sprintf(p, "(%lu, %s, %u)", key_inum(c, key),
+ get_key_type(type), key_block(c, key));
+ break;
+ case UBIFS_TRUN_KEY:
+ sprintf(p, "(%lu, %s)",
+ key_inum(c, key), get_key_type(type));
+ break;
+ default:
+ sprintf(p, "(bad key type: %#08x, %#08x)",
+ key->u32[0], key->u32[1]);
+ }
+ } else
+ sprintf(p, "bad key format %d", c->key_fmt);
+}
+
+const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key)
+{
+ /* dbg_lock must be held */
+ sprintf_key(c, key, dbg_key_buf0);
+ return dbg_key_buf0;
+}
+
+const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key)
+{
+ /* dbg_lock must be held */
+ sprintf_key(c, key, dbg_key_buf1);
+ return dbg_key_buf1;
+}
+
+const char *dbg_ntype(int type)
+{
+ switch (type) {
+ case UBIFS_PAD_NODE:
+ return "padding node";
+ case UBIFS_SB_NODE:
+ return "superblock node";
+ case UBIFS_MST_NODE:
+ return "master node";
+ case UBIFS_REF_NODE:
+ return "reference node";
+ case UBIFS_INO_NODE:
+ return "inode node";
+ case UBIFS_DENT_NODE:
+ return "direntry node";
+ case UBIFS_XENT_NODE:
+ return "xentry node";
+ case UBIFS_DATA_NODE:
+ return "data node";
+ case UBIFS_TRUN_NODE:
+ return "truncate node";
+ case UBIFS_IDX_NODE:
+ return "indexing node";
+ case UBIFS_CS_NODE:
+ return "commit start node";
+ case UBIFS_ORPH_NODE:
+ return "orphan node";
+ default:
+ return "unknown node";
+ }
+}
+
+static const char *dbg_gtype(int type)
+{
+ switch (type) {
+ case UBIFS_NO_NODE_GROUP:
+ return "no node group";
+ case UBIFS_IN_NODE_GROUP:
+ return "in node group";
+ case UBIFS_LAST_OF_NODE_GROUP:
+ return "last of node group";
+ default:
+ return "unknown";
+ }
+}
+
+const char *dbg_cstate(int cmt_state)
+{
+ switch (cmt_state) {
+ case COMMIT_RESTING:
+ return "commit resting";
+ case COMMIT_BACKGROUND:
+ return "background commit requested";
+ case COMMIT_REQUIRED:
+ return "commit required";
+ case COMMIT_RUNNING_BACKGROUND:
+ return "BACKGROUND commit running";
+ case COMMIT_RUNNING_REQUIRED:
+ return "commit running and required";
+ case COMMIT_BROKEN:
+ return "broken commit";
+ default:
+ return "unknown commit state";
+ }
+}
+
+static void dump_ch(const struct ubifs_ch *ch)
+{
+ printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic));
+ printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc));
+ printk(KERN_DEBUG "\tnode_type %d (%s)\n", ch->node_type,
+ dbg_ntype(ch->node_type));
+ printk(KERN_DEBUG "\tgroup_type %d (%s)\n", ch->group_type,
+ dbg_gtype(ch->group_type));
+ printk(KERN_DEBUG "\tsqnum %llu\n",
+ (unsigned long long)le64_to_cpu(ch->sqnum));
+ printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len));
+}
+
+void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode)
+{
+ const struct ubifs_inode *ui = ubifs_inode(inode);
+
+ printk(KERN_DEBUG "inode %lu\n", inode->i_ino);
+ printk(KERN_DEBUG "size %llu\n",
+ (unsigned long long)i_size_read(inode));
+ printk(KERN_DEBUG "nlink %u\n", inode->i_nlink);
+ printk(KERN_DEBUG "uid %u\n", (unsigned int)inode->i_uid);
+ printk(KERN_DEBUG "gid %u\n", (unsigned int)inode->i_gid);
+ printk(KERN_DEBUG "atime %u.%u\n",
+ (unsigned int)inode->i_atime.tv_sec,
+ (unsigned int)inode->i_atime.tv_nsec);
+ printk(KERN_DEBUG "mtime %u.%u\n",
+ (unsigned int)inode->i_mtime.tv_sec,
+ (unsigned int)inode->i_mtime.tv_nsec);
+ printk(KERN_DEBUG "ctime %u.%u\n",
+ (unsigned int)inode->i_ctime.tv_sec,
+ (unsigned int)inode->i_ctime.tv_nsec);
+ printk(KERN_DEBUG "creat_sqnum %llu\n", ui->creat_sqnum);
+ printk(KERN_DEBUG "xattr_size %u\n", ui->xattr_size);
+ printk(KERN_DEBUG "xattr_cnt %u\n", ui->xattr_cnt);
+ printk(KERN_DEBUG "xattr_names %u\n", ui->xattr_names);
+ printk(KERN_DEBUG "dirty %u\n", ui->dirty);
+ printk(KERN_DEBUG "xattr %u\n", ui->xattr);
+ printk(KERN_DEBUG "flags %d\n", ui->flags);
+ printk(KERN_DEBUG "compr_type %d\n", ui->compr_type);
+ printk(KERN_DEBUG "data_len %d\n", ui->data_len);
+}
+
+void dbg_dump_node(const struct ubifs_info *c, const void *node)
+{
+ int i, n;
+ union ubifs_key key;
+ const struct ubifs_ch *ch = node;
+
+ if (dbg_failure_mode)
+ return;
+
+ /* If the magic is incorrect, just hexdump the first bytes */
+ if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) {
+ printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ);
+ print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1,
+ (void *)node, UBIFS_CH_SZ, 1);
+ return;
+ }
+
+ spin_lock(&dbg_lock);
+ dump_ch(node);
+
+ switch (ch->node_type) {
+ case UBIFS_PAD_NODE:
+ {
+ const struct ubifs_pad_node *pad = node;
+
+ printk(KERN_DEBUG "\tpad_len %u\n",
+ le32_to_cpu(pad->pad_len));
+ break;
+ }
+ case UBIFS_SB_NODE:
+ {
+ const struct ubifs_sb_node *sup = node;
+ unsigned int sup_flags = le32_to_cpu(sup->flags);
+
+ printk(KERN_DEBUG "\tkey_hash %d (%s)\n",
+ (int)sup->key_hash, get_key_hash(sup->key_hash));
+ printk(KERN_DEBUG "\tkey_fmt %d (%s)\n",
+ (int)sup->key_fmt, get_key_fmt(sup->key_fmt));
+ printk(KERN_DEBUG "\tflags %#x\n", sup_flags);
+ printk(KERN_DEBUG "\t big_lpt %u\n",
+ !!(sup_flags & UBIFS_FLG_BIGLPT));
+ printk(KERN_DEBUG "\tmin_io_size %u\n",
+ le32_to_cpu(sup->min_io_size));
+ printk(KERN_DEBUG "\tleb_size %u\n",
+ le32_to_cpu(sup->leb_size));
+ printk(KERN_DEBUG "\tleb_cnt %u\n",
+ le32_to_cpu(sup->leb_cnt));
+ printk(KERN_DEBUG "\tmax_leb_cnt %u\n",
+ le32_to_cpu(sup->max_leb_cnt));
+ printk(KERN_DEBUG "\tmax_bud_bytes %llu\n",
+ (unsigned long long)le64_to_cpu(sup->max_bud_bytes));
+ printk(KERN_DEBUG "\tlog_lebs %u\n",
+ le32_to_cpu(sup->log_lebs));
+ printk(KERN_DEBUG "\tlpt_lebs %u\n",
+ le32_to_cpu(sup->lpt_lebs));
+ printk(KERN_DEBUG "\torph_lebs %u\n",
+ le32_to_cpu(sup->orph_lebs));
+ printk(KERN_DEBUG "\tjhead_cnt %u\n",
+ le32_to_cpu(sup->jhead_cnt));
+ printk(KERN_DEBUG "\tfanout %u\n",
+ le32_to_cpu(sup->fanout));
+ printk(KERN_DEBUG "\tlsave_cnt %u\n",
+ le32_to_cpu(sup->lsave_cnt));
+ printk(KERN_DEBUG "\tdefault_compr %u\n",
+ (int)le16_to_cpu(sup->default_compr));
+ printk(KERN_DEBUG "\trp_size %llu\n",
+ (unsigned long long)le64_to_cpu(sup->rp_size));
+ printk(KERN_DEBUG "\trp_uid %u\n",
+ le32_to_cpu(sup->rp_uid));
+ printk(KERN_DEBUG "\trp_gid %u\n",
+ le32_to_cpu(sup->rp_gid));
+ printk(KERN_DEBUG "\tfmt_version %u\n",
+ le32_to_cpu(sup->fmt_version));
+ printk(KERN_DEBUG "\ttime_gran %u\n",
+ le32_to_cpu(sup->time_gran));
+ printk(KERN_DEBUG "\tUUID %02X%02X%02X%02X-%02X%02X"
+ "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n",
+ sup->uuid[0], sup->uuid[1], sup->uuid[2], sup->uuid[3],
+ sup->uuid[4], sup->uuid[5], sup->uuid[6], sup->uuid[7],
+ sup->uuid[8], sup->uuid[9], sup->uuid[10], sup->uuid[11],
+ sup->uuid[12], sup->uuid[13], sup->uuid[14],
+ sup->uuid[15]);
+ break;
+ }
+ case UBIFS_MST_NODE:
+ {
+ const struct ubifs_mst_node *mst = node;
+
+ printk(KERN_DEBUG "\thighest_inum %llu\n",
+ (unsigned long long)le64_to_cpu(mst->highest_inum));
+ printk(KERN_DEBUG "\tcommit number %llu\n",
+ (unsigned long long)le64_to_cpu(mst->cmt_no));
+ printk(KERN_DEBUG "\tflags %#x\n",
+ le32_to_cpu(mst->flags));
+ printk(KERN_DEBUG "\tlog_lnum %u\n",
+ le32_to_cpu(mst->log_lnum));
+ printk(KERN_DEBUG "\troot_lnum %u\n",
+ le32_to_cpu(mst->root_lnum));
+ printk(KERN_DEBUG "\troot_offs %u\n",
+ le32_to_cpu(mst->root_offs));
+ printk(KERN_DEBUG "\troot_len %u\n",
+ le32_to_cpu(mst->root_len));
+ printk(KERN_DEBUG "\tgc_lnum %u\n",
+ le32_to_cpu(mst->gc_lnum));
+ printk(KERN_DEBUG "\tihead_lnum %u\n",
+ le32_to_cpu(mst->ihead_lnum));
+ printk(KERN_DEBUG "\tihead_offs %u\n",
+ le32_to_cpu(mst->ihead_offs));
+ printk(KERN_DEBUG "\tindex_size %u\n",
+ le32_to_cpu(mst->index_size));
+ printk(KERN_DEBUG "\tlpt_lnum %u\n",
+ le32_to_cpu(mst->lpt_lnum));
+ printk(KERN_DEBUG "\tlpt_offs %u\n",
+ le32_to_cpu(mst->lpt_offs));
+ printk(KERN_DEBUG "\tnhead_lnum %u\n",
+ le32_to_cpu(mst->nhead_lnum));
+ printk(KERN_DEBUG "\tnhead_offs %u\n",
+ le32_to_cpu(mst->nhead_offs));
+ printk(KERN_DEBUG "\tltab_lnum %u\n",
+ le32_to_cpu(mst->ltab_lnum));
+ printk(KERN_DEBUG "\tltab_offs %u\n",
+ le32_to_cpu(mst->ltab_offs));
+ printk(KERN_DEBUG "\tlsave_lnum %u\n",
+ le32_to_cpu(mst->lsave_lnum));
+ printk(KERN_DEBUG "\tlsave_offs %u\n",
+ le32_to_cpu(mst->lsave_offs));
+ printk(KERN_DEBUG "\tlscan_lnum %u\n",
+ le32_to_cpu(mst->lscan_lnum));
+ printk(KERN_DEBUG "\tleb_cnt %u\n",
+ le32_to_cpu(mst->leb_cnt));
+ printk(KERN_DEBUG "\tempty_lebs %u\n",
+ le32_to_cpu(mst->empty_lebs));
+ printk(KERN_DEBUG "\tidx_lebs %u\n",
+ le32_to_cpu(mst->idx_lebs));
+ printk(KERN_DEBUG "\ttotal_free %llu\n",
+ (unsigned long long)le64_to_cpu(mst->total_free));
+ printk(KERN_DEBUG "\ttotal_dirty %llu\n",
+ (unsigned long long)le64_to_cpu(mst->total_dirty));
+ printk(KERN_DEBUG "\ttotal_used %llu\n",
+ (unsigned long long)le64_to_cpu(mst->total_used));
+ printk(KERN_DEBUG "\ttotal_dead %llu\n",
+ (unsigned long long)le64_to_cpu(mst->total_dead));
+ printk(KERN_DEBUG "\ttotal_dark %llu\n",
+ (unsigned long long)le64_to_cpu(mst->total_dark));
+ break;
+ }
+ case UBIFS_REF_NODE:
+ {
+ const struct ubifs_ref_node *ref = node;
+
+ printk(KERN_DEBUG "\tlnum %u\n",
+ le32_to_cpu(ref->lnum));
+ printk(KERN_DEBUG "\toffs %u\n",
+ le32_to_cpu(ref->offs));
+ printk(KERN_DEBUG "\tjhead %u\n",
+ le32_to_cpu(ref->jhead));
+ break;
+ }
+ case UBIFS_INO_NODE:
+ {
+ const struct ubifs_ino_node *ino = node;
+
+ key_read(c, &ino->key, &key);
+ printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+ printk(KERN_DEBUG "\tcreat_sqnum %llu\n",
+ (unsigned long long)le64_to_cpu(ino->creat_sqnum));
+ printk(KERN_DEBUG "\tsize %llu\n",
+ (unsigned long long)le64_to_cpu(ino->size));
+ printk(KERN_DEBUG "\tnlink %u\n",
+ le32_to_cpu(ino->nlink));
+ printk(KERN_DEBUG "\tatime %lld.%u\n",
+ (long long)le64_to_cpu(ino->atime_sec),
+ le32_to_cpu(ino->atime_nsec));
+ printk(KERN_DEBUG "\tmtime %lld.%u\n",
+ (long long)le64_to_cpu(ino->mtime_sec),
+ le32_to_cpu(ino->mtime_nsec));
+ printk(KERN_DEBUG "\tctime %lld.%u\n",
+ (long long)le64_to_cpu(ino->ctime_sec),
+ le32_to_cpu(ino->ctime_nsec));
+ printk(KERN_DEBUG "\tuid %u\n",
+ le32_to_cpu(ino->uid));
+ printk(KERN_DEBUG "\tgid %u\n",
+ le32_to_cpu(ino->gid));
+ printk(KERN_DEBUG "\tmode %u\n",
+ le32_to_cpu(ino->mode));
+ printk(KERN_DEBUG "\tflags %#x\n",
+ le32_to_cpu(ino->flags));
+ printk(KERN_DEBUG "\txattr_cnt %u\n",
+ le32_to_cpu(ino->xattr_cnt));
+ printk(KERN_DEBUG "\txattr_size %u\n",
+ le32_to_cpu(ino->xattr_size));
+ printk(KERN_DEBUG "\txattr_names %u\n",
+ le32_to_cpu(ino->xattr_names));
+ printk(KERN_DEBUG "\tcompr_type %#x\n",
+ (int)le16_to_cpu(ino->compr_type));
+ printk(KERN_DEBUG "\tdata len %u\n",
+ le32_to_cpu(ino->data_len));
+ break;
+ }
+ case UBIFS_DENT_NODE:
+ case UBIFS_XENT_NODE:
+ {
+ const struct ubifs_dent_node *dent = node;
+ int nlen = le16_to_cpu(dent->nlen);
+
+ key_read(c, &dent->key, &key);
+ printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+ printk(KERN_DEBUG "\tinum %llu\n",
+ (unsigned long long)le64_to_cpu(dent->inum));
+ printk(KERN_DEBUG "\ttype %d\n", (int)dent->type);
+ printk(KERN_DEBUG "\tnlen %d\n", nlen);
+ printk(KERN_DEBUG "\tname ");
+
+ if (nlen > UBIFS_MAX_NLEN)
+ printk(KERN_DEBUG "(bad name length, not printing, "
+ "bad or corrupted node)");
+ else {
+ for (i = 0; i < nlen && dent->name[i]; i++)
+ printk("%c", dent->name[i]);
+ }
+ printk("\n");
+
+ break;
+ }
+ case UBIFS_DATA_NODE:
+ {
+ const struct ubifs_data_node *dn = node;
+ int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
+
+ key_read(c, &dn->key, &key);
+ printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key));
+ printk(KERN_DEBUG "\tsize %u\n",
+ le32_to_cpu(dn->size));
+ printk(KERN_DEBUG "\tcompr_typ %d\n",
+ (int)le16_to_cpu(dn->compr_type));
+ printk(KERN_DEBUG "\tdata size %d\n",
+ dlen);
+ printk(KERN_DEBUG "\tdata:\n");
+ print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1,
+ (void *)&dn->data, dlen, 0);
+ break;
+ }
+ case UBIFS_TRUN_NODE:
+ {
+ const struct ubifs_trun_node *trun = node;
+
+ printk(KERN_DEBUG "\tinum %u\n",
+ le32_to_cpu(trun->inum));
+ printk(KERN_DEBUG "\told_size %llu\n",
+ (unsigned long long)le64_to_cpu(trun->old_size));
+ printk(KERN_DEBUG "\tnew_size %llu\n",
+ (unsigned long long)le64_to_cpu(trun->new_size));
+ break;
+ }
+ case UBIFS_IDX_NODE:
+ {
+ const struct ubifs_idx_node *idx = node;
+
+ n = le16_to_cpu(idx->child_cnt);
+ printk(KERN_DEBUG "\tchild_cnt %d\n", n);
+ printk(KERN_DEBUG "\tlevel %d\n",
+ (int)le16_to_cpu(idx->level));
+ printk(KERN_DEBUG "\tBranches:\n");
+
+ for (i = 0; i < n && i < c->fanout - 1; i++) {
+ const struct ubifs_branch *br;
+
+ br = ubifs_idx_branch(c, idx, i);
+ key_read(c, &br->key, &key);
+ printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n",
+ i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
+ le32_to_cpu(br->len), DBGKEY(&key));
+ }
+ break;
+ }
+ case UBIFS_CS_NODE:
+ break;
+ case UBIFS_ORPH_NODE:
+ {
+ const struct ubifs_orph_node *orph = node;
+
+ printk(KERN_DEBUG "\tcommit number %llu\n",
+ (unsigned long long)
+ le64_to_cpu(orph->cmt_no) & LLONG_MAX);
+ printk(KERN_DEBUG "\tlast node flag %llu\n",
+ (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63);
+ n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3;
+ printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n);
+ for (i = 0; i < n; i++)
+ printk(KERN_DEBUG "\t ino %llu\n",
+ (unsigned long long)le64_to_cpu(orph->inos[i]));
+ break;
+ }
+ default:
+ printk(KERN_DEBUG "node type %d was not recognized\n",
+ (int)ch->node_type);
+ }
+ spin_unlock(&dbg_lock);
+}
+
+void dbg_dump_budget_req(const struct ubifs_budget_req *req)
+{
+ spin_lock(&dbg_lock);
+ printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n",
+ req->new_ino, req->dirtied_ino);
+ printk(KERN_DEBUG "\tnew_ino_d %d, dirtied_ino_d %d\n",
+ req->new_ino_d, req->dirtied_ino_d);
+ printk(KERN_DEBUG "\tnew_page %d, dirtied_page %d\n",
+ req->new_page, req->dirtied_page);
+ printk(KERN_DEBUG "\tnew_dent %d, mod_dent %d\n",
+ req->new_dent, req->mod_dent);
+ printk(KERN_DEBUG "\tidx_growth %d\n", req->idx_growth);
+ printk(KERN_DEBUG "\tdata_growth %d dd_growth %d\n",
+ req->data_growth, req->dd_growth);
+ spin_unlock(&dbg_lock);
+}
+
+void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
+{
+ spin_lock(&dbg_lock);
+ printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, "
+ "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs);
+ printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, "
+ "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free,
+ lst->total_dirty);
+ printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, "
+ "total_dead %lld\n", lst->total_used, lst->total_dark,
+ lst->total_dead);
+ spin_unlock(&dbg_lock);
+}
+
+void dbg_dump_budg(struct ubifs_info *c)
+{
+ int i;
+ struct rb_node *rb;
+ struct ubifs_bud *bud;
+ struct ubifs_gced_idx_leb *idx_gc;
+
+ spin_lock(&dbg_lock);
+ printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
+ "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
+ c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth);
+ printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, "
+ "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth,
+ c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth,
+ c->freeable_cnt);
+ printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, "
+ "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs,
+ c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt);
+ printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
+ "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
+ atomic_long_read(&c->dirty_zn_cnt),
+ atomic_long_read(&c->clean_zn_cnt));
+ printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
+ c->dark_wm, c->dead_wm, c->max_idx_node_sz);
+ printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
+ c->gc_lnum, c->ihead_lnum);
+ for (i = 0; i < c->jhead_cnt; i++)
+ printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
+ c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
+ for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
+ bud = rb_entry(rb, struct ubifs_bud, rb);
+ printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
+ }
+ list_for_each_entry(bud, &c->old_buds, list)
+ printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum);
+ list_for_each_entry(idx_gc, &c->idx_gc, list)
+ printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n",
+ idx_gc->lnum, idx_gc->unmap);
+ printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
+ spin_unlock(&dbg_lock);
+}
+
+void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
+{
+ printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), "
+ "flags %#x\n", lp->lnum, lp->free, lp->dirty,
+ c->leb_size - lp->free - lp->dirty, lp->flags);
+}
+
+void dbg_dump_lprops(struct ubifs_info *c)
+{
+ int lnum, err;
+ struct ubifs_lprops lp;
+ struct ubifs_lp_stats lst;
+
+ printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid);
+ ubifs_get_lp_stats(c, &lst);
+ dbg_dump_lstats(&lst);
+
+ for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
+ err = ubifs_read_one_lp(c, lnum, &lp);
+ if (err)
+ ubifs_err("cannot read lprops for LEB %d", lnum);
+
+ dbg_dump_lprop(c, &lp);
+ }
+}
+
+void dbg_dump_leb(const struct ubifs_info *c, int lnum)
+{
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+
+ if (dbg_failure_mode)
+ return;
+
+ printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum);
+
+ sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
+ if (IS_ERR(sleb)) {
+ ubifs_err("scan error %d", (int)PTR_ERR(sleb));
+ return;
+ }
+
+ printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum,
+ sleb->nodes_cnt, sleb->endpt);
+
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ cond_resched();
+ printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum,
+ snod->offs, snod->len);
+ dbg_dump_node(c, snod->node);
+ }
+
+ ubifs_scan_destroy(sleb);
+ return;
+}
+
+void dbg_dump_znode(const struct ubifs_info *c,
+ const struct ubifs_znode *znode)
+{
+ int n;
+ const struct ubifs_zbranch *zbr;
+
+ spin_lock(&dbg_lock);
+ if (znode->parent)
+ zbr = &znode->parent->zbranch[znode->iip];
+ else
+ zbr = &c->zroot;
+
+ printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d"
+ " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs,
+ zbr->len, znode->parent, znode->iip, znode->level,
+ znode->child_cnt, znode->flags);
+
+ if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
+ spin_unlock(&dbg_lock);
+ return;
+ }
+
+ printk(KERN_DEBUG "zbranches:\n");
+ for (n = 0; n < znode->child_cnt; n++) {
+ zbr = &znode->zbranch[n];
+ if (znode->level > 0)
+ printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key "
+ "%s\n", n, zbr->znode, zbr->lnum,
+ zbr->offs, zbr->len,
+ DBGKEY(&zbr->key));
+ else
+ printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key "
+ "%s\n", n, zbr->znode, zbr->lnum,
+ zbr->offs, zbr->len,
+ DBGKEY(&zbr->key));
+ }
+ spin_unlock(&dbg_lock);
+}
+
+void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
+{
+ int i;
+
+ printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n",
+ current->pid, cat, heap->cnt);
+ for (i = 0; i < heap->cnt; i++) {
+ struct ubifs_lprops *lprops = heap->arr[i];
+
+ printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d "
+ "flags %d\n", i, lprops->lnum, lprops->hpos,
+ lprops->free, lprops->dirty, lprops->flags);
+ }
+}
+
+void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+ struct ubifs_nnode *parent, int iip)
+{
+ int i;
+
+ printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid);
+ printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
+ (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
+ printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
+ pnode->flags, iip, pnode->level, pnode->num);
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ struct ubifs_lprops *lp = &pnode->lprops[i];
+
+ printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n",
+ i, lp->free, lp->dirty, lp->flags, lp->lnum);
+ }
+}
+
+void dbg_dump_tnc(struct ubifs_info *c)
+{
+ struct ubifs_znode *znode;
+ int level;
+
+ printk(KERN_DEBUG "\n");
+ printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid);
+ znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
+ level = znode->level;
+ printk(KERN_DEBUG "== Level %d ==\n", level);
+ while (znode) {
+ if (level != znode->level) {
+ level = znode->level;
+ printk(KERN_DEBUG "== Level %d ==\n", level);
+ }
+ dbg_dump_znode(c, znode);
+ znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
+ }
+
+ printk(KERN_DEBUG "\n");
+}
+
+static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
+ void *priv)
+{
+ dbg_dump_znode(c, znode);
+ return 0;
+}
+
+/**
+ * dbg_dump_index - dump the on-flash index.
+ * @c: UBIFS file-system description object
+ *
+ * This function dumps whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()'
+ * which dumps only in-memory znodes and does not read znodes which from flash.
+ */
+void dbg_dump_index(struct ubifs_info *c)
+{
+ dbg_walk_index(c, NULL, dump_znode, NULL);
+}
+
+/**
+ * dbg_check_synced_i_size - check synchronized inode size.
+ * @inode: inode to check
+ *
+ * If inode is clean, synchronized inode size has to be equivalent to current
+ * inode size. This function has to be called only for locked inodes (@i_mutex
+ * has to be locked). Returns %0 if synchronized inode size if correct, and
+ * %-EINVAL if not.
+ */
+int dbg_check_synced_i_size(struct inode *inode)
+{
+ int err = 0;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ return 0;
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+
+ mutex_lock(&ui->ui_mutex);
+ spin_lock(&ui->ui_lock);
+ if (ui->ui_size != ui->synced_i_size && !ui->dirty) {
+ ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode "
+ "is clean", ui->ui_size, ui->synced_i_size);
+ ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino,
+ inode->i_mode, i_size_read(inode));
+ dbg_dump_stack();
+ err = -EINVAL;
+ }
+ spin_unlock(&ui->ui_lock);
+ mutex_unlock(&ui->ui_mutex);
+ return err;
+}
+
+/*
+ * dbg_check_dir - check directory inode size and link count.
+ * @c: UBIFS file-system description object
+ * @dir: the directory to calculate size for
+ * @size: the result is returned here
+ *
+ * This function makes sure that directory size and link count are correct.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ *
+ * Note, it is good idea to make sure the @dir->i_mutex is locked before
+ * calling this function.
+ */
+int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir)
+{
+ unsigned int nlink = 2;
+ union ubifs_key key;
+ struct ubifs_dent_node *dent, *pdent = NULL;
+ struct qstr nm = { .name = NULL };
+ loff_t size = UBIFS_INO_NODE_SZ;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ return 0;
+
+ if (!S_ISDIR(dir->i_mode))
+ return 0;
+
+ lowest_dent_key(c, &key, dir->i_ino);
+ while (1) {
+ int err;
+
+ dent = ubifs_tnc_next_ent(c, &key, &nm);
+ if (IS_ERR(dent)) {
+ err = PTR_ERR(dent);
+ if (err == -ENOENT)
+ break;
+ return err;
+ }
+
+ nm.name = dent->name;
+ nm.len = le16_to_cpu(dent->nlen);
+ size += CALC_DENT_SIZE(nm.len);
+ if (dent->type == UBIFS_ITYPE_DIR)
+ nlink += 1;
+ kfree(pdent);
+ pdent = dent;
+ key_read(c, &dent->key, &key);
+ }
+ kfree(pdent);
+
+ if (i_size_read(dir) != size) {
+ ubifs_err("directory inode %lu has size %llu, "
+ "but calculated size is %llu", dir->i_ino,
+ (unsigned long long)i_size_read(dir),
+ (unsigned long long)size);
+ dump_stack();
+ return -EINVAL;
+ }
+ if (dir->i_nlink != nlink) {
+ ubifs_err("directory inode %lu has nlink %u, but calculated "
+ "nlink is %u", dir->i_ino, dir->i_nlink, nlink);
+ dump_stack();
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * dbg_check_key_order - make sure that colliding keys are properly ordered.
+ * @c: UBIFS file-system description object
+ * @zbr1: first zbranch
+ * @zbr2: following zbranch
+ *
+ * In UBIFS indexing B-tree colliding keys has to be sorted in binary order of
+ * names of the direntries/xentries which are referred by the keys. This
+ * function reads direntries/xentries referred by @zbr1 and @zbr2 and makes
+ * sure the name of direntry/xentry referred by @zbr1 is less than
+ * direntry/xentry referred by @zbr2. Returns zero if this is true, %1 if not,
+ * and a negative error code in case of failure.
+ */
+static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
+ struct ubifs_zbranch *zbr2)
+{
+ int err, nlen1, nlen2, cmp;
+ struct ubifs_dent_node *dent1, *dent2;
+ union ubifs_key key;
+
+ ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
+ dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+ if (!dent1)
+ return -ENOMEM;
+ dent2 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+ if (!dent2) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+
+ err = ubifs_tnc_read_node(c, zbr1, dent1);
+ if (err)
+ goto out_free;
+ err = ubifs_validate_entry(c, dent1);
+ if (err)
+ goto out_free;
+
+ err = ubifs_tnc_read_node(c, zbr2, dent2);
+ if (err)
+ goto out_free;
+ err = ubifs_validate_entry(c, dent2);
+ if (err)
+ goto out_free;
+
+ /* Make sure node keys are the same as in zbranch */
+ err = 1;
+ key_read(c, &dent1->key, &key);
+ if (keys_cmp(c, &zbr1->key, &key)) {
+ dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
+ zbr1->offs, DBGKEY(&key));
+ dbg_err("but it should have key %s according to tnc",
+ DBGKEY(&zbr1->key));
+ dbg_dump_node(c, dent1);
+ goto out_free;
+ }
+
+ key_read(c, &dent2->key, &key);
+ if (keys_cmp(c, &zbr2->key, &key)) {
+ dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
+ zbr1->offs, DBGKEY(&key));
+ dbg_err("but it should have key %s according to tnc",
+ DBGKEY(&zbr2->key));
+ dbg_dump_node(c, dent2);
+ goto out_free;
+ }
+
+ nlen1 = le16_to_cpu(dent1->nlen);
+ nlen2 = le16_to_cpu(dent2->nlen);
+
+ cmp = memcmp(dent1->name, dent2->name, min_t(int, nlen1, nlen2));
+ if (cmp < 0 || (cmp == 0 && nlen1 < nlen2)) {
+ err = 0;
+ goto out_free;
+ }
+ if (cmp == 0 && nlen1 == nlen2)
+ dbg_err("2 xent/dent nodes with the same name");
+ else
+ dbg_err("bad order of colliding key %s",
+ DBGKEY(&key));
+
+ dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
+ dbg_dump_node(c, dent1);
+ dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
+ dbg_dump_node(c, dent2);
+
+out_free:
+ kfree(dent2);
+ kfree(dent1);
+ return err;
+}
+
+/**
+ * dbg_check_znode - check if znode is all right.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch which points to this znode
+ *
+ * This function makes sure that znode referred to by @zbr is all right.
+ * Returns zero if it is, and %-EINVAL if it is not.
+ */
+static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
+{
+ struct ubifs_znode *znode = zbr->znode;
+ struct ubifs_znode *zp = znode->parent;
+ int n, err, cmp;
+
+ if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
+ err = 1;
+ goto out;
+ }
+ if (znode->level < 0) {
+ err = 2;
+ goto out;
+ }
+ if (znode->iip < 0 || znode->iip >= c->fanout) {
+ err = 3;
+ goto out;
+ }
+
+ if (zbr->len == 0)
+ /* Only dirty zbranch may have no on-flash nodes */
+ if (!ubifs_zn_dirty(znode)) {
+ err = 4;
+ goto out;
+ }
+
+ if (ubifs_zn_dirty(znode)) {
+ /*
+ * If znode is dirty, its parent has to be dirty as well. The
+ * order of the operation is important, so we have to have
+ * memory barriers.
+ */
+ smp_mb();
+ if (zp && !ubifs_zn_dirty(zp)) {
+ /*
+ * The dirty flag is atomic and is cleared outside the
+ * TNC mutex, so znode's dirty flag may now have
+ * been cleared. The child is always cleared before the
+ * parent, so we just need to check again.
+ */
+ smp_mb();
+ if (ubifs_zn_dirty(znode)) {
+ err = 5;
+ goto out;
+ }
+ }
+ }
+
+ if (zp) {
+ const union ubifs_key *min, *max;
+
+ if (znode->level != zp->level - 1) {
+ err = 6;
+ goto out;
+ }
+
+ /* Make sure the 'parent' pointer in our znode is correct */
+ err = ubifs_search_zbranch(c, zp, &zbr->key, &n);
+ if (!err) {
+ /* This zbranch does not exist in the parent */
+ err = 7;
+ goto out;
+ }
+
+ if (znode->iip >= zp->child_cnt) {
+ err = 8;
+ goto out;
+ }
+
+ if (znode->iip != n) {
+ /* This may happen only in case of collisions */
+ if (keys_cmp(c, &zp->zbranch[n].key,
+ &zp->zbranch[znode->iip].key)) {
+ err = 9;
+ goto out;
+ }
+ n = znode->iip;
+ }
+
+ /*
+ * Make sure that the first key in our znode is greater than or
+ * equal to the key in the pointing zbranch.
+ */
+ min = &zbr->key;
+ cmp = keys_cmp(c, min, &znode->zbranch[0].key);
+ if (cmp == 1) {
+ err = 10;
+ goto out;
+ }
+
+ if (n + 1 < zp->child_cnt) {
+ max = &zp->zbranch[n + 1].key;
+
+ /*
+ * Make sure the last key in our znode is less or
+ * equivalent than the the key in zbranch which goes
+ * after our pointing zbranch.
+ */
+ cmp = keys_cmp(c, max,
+ &znode->zbranch[znode->child_cnt - 1].key);
+ if (cmp == -1) {
+ err = 11;
+ goto out;
+ }
+ }
+ } else {
+ /* This may only be root znode */
+ if (zbr != &c->zroot) {
+ err = 12;
+ goto out;
+ }
+ }
+
+ /*
+ * Make sure that next key is greater or equivalent then the previous
+ * one.
+ */
+ for (n = 1; n < znode->child_cnt; n++) {
+ cmp = keys_cmp(c, &znode->zbranch[n - 1].key,
+ &znode->zbranch[n].key);
+ if (cmp > 0) {
+ err = 13;
+ goto out;
+ }
+ if (cmp == 0) {
+ /* This can only be keys with colliding hash */
+ if (!is_hash_key(c, &znode->zbranch[n].key)) {
+ err = 14;
+ goto out;
+ }
+
+ if (znode->level != 0 || c->replaying)
+ continue;
+
+ /*
+ * Colliding keys should follow binary order of
+ * corresponding xentry/dentry names.
+ */
+ err = dbg_check_key_order(c, &znode->zbranch[n - 1],
+ &znode->zbranch[n]);
+ if (err < 0)
+ return err;
+ if (err) {
+ err = 15;
+ goto out;
+ }
+ }
+ }
+
+ for (n = 0; n < znode->child_cnt; n++) {
+ if (!znode->zbranch[n].znode &&
+ (znode->zbranch[n].lnum == 0 ||
+ znode->zbranch[n].len == 0)) {
+ err = 16;
+ goto out;
+ }
+
+ if (znode->zbranch[n].lnum != 0 &&
+ znode->zbranch[n].len == 0) {
+ err = 17;
+ goto out;
+ }
+
+ if (znode->zbranch[n].lnum == 0 &&
+ znode->zbranch[n].len != 0) {
+ err = 18;
+ goto out;
+ }
+
+ if (znode->zbranch[n].lnum == 0 &&
+ znode->zbranch[n].offs != 0) {
+ err = 19;
+ goto out;
+ }
+
+ if (znode->level != 0 && znode->zbranch[n].znode)
+ if (znode->zbranch[n].znode->parent != znode) {
+ err = 20;
+ goto out;
+ }
+ }
+
+ return 0;
+
+out:
+ ubifs_err("failed, error %d", err);
+ ubifs_msg("dump of the znode");
+ dbg_dump_znode(c, znode);
+ if (zp) {
+ ubifs_msg("dump of the parent znode");
+ dbg_dump_znode(c, zp);
+ }
+ dump_stack();
+ return -EINVAL;
+}
+
+/**
+ * dbg_check_tnc - check TNC tree.
+ * @c: UBIFS file-system description object
+ * @extra: do extra checks that are possible at start commit
+ *
+ * This function traverses whole TNC tree and checks every znode. Returns zero
+ * if everything is all right and %-EINVAL if something is wrong with TNC.
+ */
+int dbg_check_tnc(struct ubifs_info *c, int extra)
+{
+ struct ubifs_znode *znode;
+ long clean_cnt = 0, dirty_cnt = 0;
+ int err, last;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_TNC))
+ return 0;
+
+ ubifs_assert(mutex_is_locked(&c->tnc_mutex));
+ if (!c->zroot.znode)
+ return 0;
+
+ znode = ubifs_tnc_postorder_first(c->zroot.znode);
+ while (1) {
+ struct ubifs_znode *prev;
+ struct ubifs_zbranch *zbr;
+
+ if (!znode->parent)
+ zbr = &c->zroot;
+ else
+ zbr = &znode->parent->zbranch[znode->iip];
+
+ err = dbg_check_znode(c, zbr);
+ if (err)
+ return err;
+
+ if (extra) {
+ if (ubifs_zn_dirty(znode))
+ dirty_cnt += 1;
+ else
+ clean_cnt += 1;
+ }
+
+ prev = znode;
+ znode = ubifs_tnc_postorder_next(znode);
+ if (!znode)
+ break;
+
+ /*
+ * If the last key of this znode is equivalent to the first key
+ * of the next znode (collision), then check order of the keys.
+ */
+ last = prev->child_cnt - 1;
+ if (prev->level == 0 && znode->level == 0 && !c->replaying &&
+ !keys_cmp(c, &prev->zbranch[last].key,
+ &znode->zbranch[0].key)) {
+ err = dbg_check_key_order(c, &prev->zbranch[last],
+ &znode->zbranch[0]);
+ if (err < 0)
+ return err;
+ if (err) {
+ ubifs_msg("first znode");
+ dbg_dump_znode(c, prev);
+ ubifs_msg("second znode");
+ dbg_dump_znode(c, znode);
+ return -EINVAL;
+ }
+ }
+ }
+
+ if (extra) {
+ if (clean_cnt != atomic_long_read(&c->clean_zn_cnt)) {
+ ubifs_err("incorrect clean_zn_cnt %ld, calculated %ld",
+ atomic_long_read(&c->clean_zn_cnt),
+ clean_cnt);
+ return -EINVAL;
+ }
+ if (dirty_cnt != atomic_long_read(&c->dirty_zn_cnt)) {
+ ubifs_err("incorrect dirty_zn_cnt %ld, calculated %ld",
+ atomic_long_read(&c->dirty_zn_cnt),
+ dirty_cnt);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * dbg_walk_index - walk the on-flash index.
+ * @c: UBIFS file-system description object
+ * @leaf_cb: called for each leaf node
+ * @znode_cb: called for each indexing node
+ * @priv: private date which is passed to callbacks
+ *
+ * This function walks the UBIFS index and calls the @leaf_cb for each leaf
+ * node and @znode_cb for each indexing node. Returns zero in case of success
+ * and a negative error code in case of failure.
+ *
+ * It would be better if this function removed every znode it pulled to into
+ * the TNC, so that the behavior more closely matched the non-debugging
+ * behavior.
+ */
+int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
+ dbg_znode_callback znode_cb, void *priv)
+{
+ int err;
+ struct ubifs_zbranch *zbr;
+ struct ubifs_znode *znode, *child;
+
+ mutex_lock(&c->tnc_mutex);
+ /* If the root indexing node is not in TNC - pull it */
+ if (!c->zroot.znode) {
+ c->zroot.znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
+ if (IS_ERR(c->zroot.znode)) {
+ err = PTR_ERR(c->zroot.znode);
+ c->zroot.znode = NULL;
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * We are going to traverse the indexing tree in the postorder manner.
+ * Go down and find the leftmost indexing node where we are going to
+ * start from.
+ */
+ znode = c->zroot.znode;
+ while (znode->level > 0) {
+ zbr = &znode->zbranch[0];
+ child = zbr->znode;
+ if (!child) {
+ child = ubifs_load_znode(c, zbr, znode, 0);
+ if (IS_ERR(child)) {
+ err = PTR_ERR(child);
+ goto out_unlock;
+ }
+ zbr->znode = child;
+ }
+
+ znode = child;
+ }
+
+ /* Iterate over all indexing nodes */
+ while (1) {
+ int idx;
+
+ cond_resched();
+
+ if (znode_cb) {
+ err = znode_cb(c, znode, priv);
+ if (err) {
+ ubifs_err("znode checking function returned "
+ "error %d", err);
+ dbg_dump_znode(c, znode);
+ goto out_dump;
+ }
+ }
+ if (leaf_cb && znode->level == 0) {
+ for (idx = 0; idx < znode->child_cnt; idx++) {
+ zbr = &znode->zbranch[idx];
+ err = leaf_cb(c, zbr, priv);
+ if (err) {
+ ubifs_err("leaf checking function "
+ "returned error %d, for leaf "
+ "at LEB %d:%d",
+ err, zbr->lnum, zbr->offs);
+ goto out_dump;
+ }
+ }
+ }
+
+ if (!znode->parent)
+ break;
+
+ idx = znode->iip + 1;
+ znode = znode->parent;
+ if (idx < znode->child_cnt) {
+ /* Switch to the next index in the parent */
+ zbr = &znode->zbranch[idx];
+ child = zbr->znode;
+ if (!child) {
+ child = ubifs_load_znode(c, zbr, znode, idx);
+ if (IS_ERR(child)) {
+ err = PTR_ERR(child);
+ goto out_unlock;
+ }
+ zbr->znode = child;
+ }
+ znode = child;
+ } else
+ /*
+ * This is the last child, switch to the parent and
+ * continue.
+ */
+ continue;
+
+ /* Go to the lowest leftmost znode in the new sub-tree */
+ while (znode->level > 0) {
+ zbr = &znode->zbranch[0];
+ child = zbr->znode;
+ if (!child) {
+ child = ubifs_load_znode(c, zbr, znode, 0);
+ if (IS_ERR(child)) {
+ err = PTR_ERR(child);
+ goto out_unlock;
+ }
+ zbr->znode = child;
+ }
+ znode = child;
+ }
+ }
+
+ mutex_unlock(&c->tnc_mutex);
+ return 0;
+
+out_dump:
+ if (znode->parent)
+ zbr = &znode->parent->zbranch[znode->iip];
+ else
+ zbr = &c->zroot;
+ ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs);
+ dbg_dump_znode(c, znode);
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * add_size - add znode size to partially calculated index size.
+ * @c: UBIFS file-system description object
+ * @znode: znode to add size for
+ * @priv: partially calculated index size
+ *
+ * This is a helper function for 'dbg_check_idx_size()' which is called for
+ * every indexing node and adds its size to the 'long long' variable pointed to
+ * by @priv.
+ */
+static int add_size(struct ubifs_info *c, struct ubifs_znode *znode, void *priv)
+{
+ long long *idx_size = priv;
+ int add;
+
+ add = ubifs_idx_node_sz(c, znode->child_cnt);
+ add = ALIGN(add, 8);
+ *idx_size += add;
+ return 0;
+}
+
+/**
+ * dbg_check_idx_size - check index size.
+ * @c: UBIFS file-system description object
+ * @idx_size: size to check
+ *
+ * This function walks the UBIFS index, calculates its size and checks that the
+ * size is equivalent to @idx_size. Returns zero in case of success and a
+ * negative error code in case of failure.
+ */
+int dbg_check_idx_size(struct ubifs_info *c, long long idx_size)
+{
+ int err;
+ long long calc = 0;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_IDX_SZ))
+ return 0;
+
+ err = dbg_walk_index(c, NULL, add_size, &calc);
+ if (err) {
+ ubifs_err("error %d while walking the index", err);
+ return err;
+ }
+
+ if (calc != idx_size) {
+ ubifs_err("index size check failed: calculated size is %lld, "
+ "should be %lld", calc, idx_size);
+ dump_stack();
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * struct fsck_inode - information about an inode used when checking the file-system.
+ * @rb: link in the RB-tree of inodes
+ * @inum: inode number
+ * @mode: inode type, permissions, etc
+ * @nlink: inode link count
+ * @xattr_cnt: count of extended attributes
+ * @references: how many directory/xattr entries refer this inode (calculated
+ * while walking the index)
+ * @calc_cnt: for directory inode count of child directories
+ * @size: inode size (read from on-flash inode)
+ * @xattr_sz: summary size of all extended attributes (read from on-flash
+ * inode)
+ * @calc_sz: for directories calculated directory size
+ * @calc_xcnt: count of extended attributes
+ * @calc_xsz: calculated summary size of all extended attributes
+ * @xattr_nms: sum of lengths of all extended attribute names belonging to this
+ * inode (read from on-flash inode)
+ * @calc_xnms: calculated sum of lengths of all extended attribute names
+ */
+struct fsck_inode {
+ struct rb_node rb;
+ ino_t inum;
+ umode_t mode;
+ unsigned int nlink;
+ unsigned int xattr_cnt;
+ int references;
+ int calc_cnt;
+ long long size;
+ unsigned int xattr_sz;
+ long long calc_sz;
+ long long calc_xcnt;
+ long long calc_xsz;
+ unsigned int xattr_nms;
+ long long calc_xnms;
+};
+
+/**
+ * struct fsck_data - private FS checking information.
+ * @inodes: RB-tree of all inodes (contains @struct fsck_inode objects)
+ */
+struct fsck_data {
+ struct rb_root inodes;
+};
+
+/**
+ * add_inode - add inode information to RB-tree of inodes.
+ * @c: UBIFS file-system description object
+ * @fsckd: FS checking information
+ * @ino: raw UBIFS inode to add
+ *
+ * This is a helper function for 'check_leaf()' which adds information about
+ * inode @ino to the RB-tree of inodes. Returns inode information pointer in
+ * case of success and a negative error code in case of failure.
+ */
+static struct fsck_inode *add_inode(struct ubifs_info *c,
+ struct fsck_data *fsckd,
+ struct ubifs_ino_node *ino)
+{
+ struct rb_node **p, *parent = NULL;
+ struct fsck_inode *fscki;
+ ino_t inum = key_inum_flash(c, &ino->key);
+
+ p = &fsckd->inodes.rb_node;
+ while (*p) {
+ parent = *p;
+ fscki = rb_entry(parent, struct fsck_inode, rb);
+ if (inum < fscki->inum)
+ p = &(*p)->rb_left;
+ else if (inum > fscki->inum)
+ p = &(*p)->rb_right;
+ else
+ return fscki;
+ }
+
+ if (inum > c->highest_inum) {
+ ubifs_err("too high inode number, max. is %lu",
+ c->highest_inum);
+ return ERR_PTR(-EINVAL);
+ }
+
+ fscki = kzalloc(sizeof(struct fsck_inode), GFP_NOFS);
+ if (!fscki)
+ return ERR_PTR(-ENOMEM);
+
+ fscki->inum = inum;
+ fscki->nlink = le32_to_cpu(ino->nlink);
+ fscki->size = le64_to_cpu(ino->size);
+ fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
+ fscki->xattr_sz = le32_to_cpu(ino->xattr_size);
+ fscki->xattr_nms = le32_to_cpu(ino->xattr_names);
+ fscki->mode = le32_to_cpu(ino->mode);
+ if (S_ISDIR(fscki->mode)) {
+ fscki->calc_sz = UBIFS_INO_NODE_SZ;
+ fscki->calc_cnt = 2;
+ }
+ rb_link_node(&fscki->rb, parent, p);
+ rb_insert_color(&fscki->rb, &fsckd->inodes);
+ return fscki;
+}
+
+/**
+ * search_inode - search inode in the RB-tree of inodes.
+ * @fsckd: FS checking information
+ * @inum: inode number to search
+ *
+ * This is a helper function for 'check_leaf()' which searches inode @inum in
+ * the RB-tree of inodes and returns an inode information pointer or %NULL if
+ * the inode was not found.
+ */
+static struct fsck_inode *search_inode(struct fsck_data *fsckd, ino_t inum)
+{
+ struct rb_node *p;
+ struct fsck_inode *fscki;
+
+ p = fsckd->inodes.rb_node;
+ while (p) {
+ fscki = rb_entry(p, struct fsck_inode, rb);
+ if (inum < fscki->inum)
+ p = p->rb_left;
+ else if (inum > fscki->inum)
+ p = p->rb_right;
+ else
+ return fscki;
+ }
+ return NULL;
+}
+
+/**
+ * read_add_inode - read inode node and add it to RB-tree of inodes.
+ * @c: UBIFS file-system description object
+ * @fsckd: FS checking information
+ * @inum: inode number to read
+ *
+ * This is a helper function for 'check_leaf()' which finds inode node @inum in
+ * the index, reads it, and adds it to the RB-tree of inodes. Returns inode
+ * information pointer in case of success and a negative error code in case of
+ * failure.
+ */
+static struct fsck_inode *read_add_inode(struct ubifs_info *c,
+ struct fsck_data *fsckd, ino_t inum)
+{
+ int n, err;
+ union ubifs_key key;
+ struct ubifs_znode *znode;
+ struct ubifs_zbranch *zbr;
+ struct ubifs_ino_node *ino;
+ struct fsck_inode *fscki;
+
+ fscki = search_inode(fsckd, inum);
+ if (fscki)
+ return fscki;
+
+ ino_key_init(c, &key, inum);
+ err = ubifs_lookup_level0(c, &key, &znode, &n);
+ if (!err) {
+ ubifs_err("inode %lu not found in index", inum);
+ return ERR_PTR(-ENOENT);
+ } else if (err < 0) {
+ ubifs_err("error %d while looking up inode %lu", err, inum);
+ return ERR_PTR(err);
+ }
+
+ zbr = &znode->zbranch[n];
+ if (zbr->len < UBIFS_INO_NODE_SZ) {
+ ubifs_err("bad node %lu node length %d", inum, zbr->len);
+ return ERR_PTR(-EINVAL);
+ }
+
+ ino = kmalloc(zbr->len, GFP_NOFS);
+ if (!ino)
+ return ERR_PTR(-ENOMEM);
+
+ err = ubifs_tnc_read_node(c, zbr, ino);
+ if (err) {
+ ubifs_err("cannot read inode node at LEB %d:%d, error %d",
+ zbr->lnum, zbr->offs, err);
+ kfree(ino);
+ return ERR_PTR(err);
+ }
+
+ fscki = add_inode(c, fsckd, ino);
+ kfree(ino);
+ if (IS_ERR(fscki)) {
+ ubifs_err("error %ld while adding inode %lu node",
+ PTR_ERR(fscki), inum);
+ return fscki;
+ }
+
+ return fscki;
+}
+
+/**
+ * check_leaf - check leaf node.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of the leaf node to check
+ * @priv: FS checking information
+ *
+ * This is a helper function for 'dbg_check_filesystem()' which is called for
+ * every single leaf node while walking the indexing tree. It checks that the
+ * leaf node referred from the indexing tree exists, has correct CRC, and does
+ * some other basic validation. This function is also responsible for building
+ * an RB-tree of inodes - it adds all inodes into the RB-tree. It also
+ * calculates reference count, size, etc for each inode in order to later
+ * compare them to the information stored inside the inodes and detect possible
+ * inconsistencies. Returns zero in case of success and a negative error code
+ * in case of failure.
+ */
+static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+ void *priv)
+{
+ ino_t inum;
+ void *node;
+ struct ubifs_ch *ch;
+ int err, type = key_type(c, &zbr->key);
+ struct fsck_inode *fscki;
+
+ if (zbr->len < UBIFS_CH_SZ) {
+ ubifs_err("bad leaf length %d (LEB %d:%d)",
+ zbr->len, zbr->lnum, zbr->offs);
+ return -EINVAL;
+ }
+
+ node = kmalloc(zbr->len, GFP_NOFS);
+ if (!node)
+ return -ENOMEM;
+
+ err = ubifs_tnc_read_node(c, zbr, node);
+ if (err) {
+ ubifs_err("cannot read leaf node at LEB %d:%d, error %d",
+ zbr->lnum, zbr->offs, err);
+ goto out_free;
+ }
+
+ /* If this is an inode node, add it to RB-tree of inodes */
+ if (type == UBIFS_INO_KEY) {
+ fscki = add_inode(c, priv, node);
+ if (IS_ERR(fscki)) {
+ err = PTR_ERR(fscki);
+ ubifs_err("error %d while adding inode node", err);
+ goto out_dump;
+ }
+ goto out;
+ }
+
+ if (type != UBIFS_DENT_KEY && type != UBIFS_XENT_KEY &&
+ type != UBIFS_DATA_KEY) {
+ ubifs_err("unexpected node type %d at LEB %d:%d",
+ type, zbr->lnum, zbr->offs);
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ ch = node;
+ if (le64_to_cpu(ch->sqnum) > c->max_sqnum) {
+ ubifs_err("too high sequence number, max. is %llu",
+ c->max_sqnum);
+ err = -EINVAL;
+ goto out_dump;
+ }
+
+ if (type == UBIFS_DATA_KEY) {
+ long long blk_offs;
+ struct ubifs_data_node *dn = node;
+
+ /*
+ * Search the inode node this data node belongs to and insert
+ * it to the RB-tree of inodes.
+ */
+ inum = key_inum_flash(c, &dn->key);
+ fscki = read_add_inode(c, priv, inum);
+ if (IS_ERR(fscki)) {
+ err = PTR_ERR(fscki);
+ ubifs_err("error %d while processing data node and "
+ "trying to find inode node %lu", err, inum);
+ goto out_dump;
+ }
+
+ /* Make sure the data node is within inode size */
+ blk_offs = key_block_flash(c, &dn->key);
+ blk_offs <<= UBIFS_BLOCK_SHIFT;
+ blk_offs += le32_to_cpu(dn->size);
+ if (blk_offs > fscki->size) {
+ ubifs_err("data node at LEB %d:%d is not within inode "
+ "size %lld", zbr->lnum, zbr->offs,
+ fscki->size);
+ err = -EINVAL;
+ goto out_dump;
+ }
+ } else {
+ int nlen;
+ struct ubifs_dent_node *dent = node;
+ struct fsck_inode *fscki1;
+
+ err = ubifs_validate_entry(c, dent);
+ if (err)
+ goto out_dump;
+
+ /*
+ * Search the inode node this entry refers to and the parent
+ * inode node and insert them to the RB-tree of inodes.
+ */
+ inum = le64_to_cpu(dent->inum);
+ fscki = read_add_inode(c, priv, inum);
+ if (IS_ERR(fscki)) {
+ err = PTR_ERR(fscki);
+ ubifs_err("error %d while processing entry node and "
+ "trying to find inode node %lu", err, inum);
+ goto out_dump;
+ }
+
+ /* Count how many direntries or xentries refers this inode */
+ fscki->references += 1;
+
+ inum = key_inum_flash(c, &dent->key);
+ fscki1 = read_add_inode(c, priv, inum);
+ if (IS_ERR(fscki1)) {
+ err = PTR_ERR(fscki);
+ ubifs_err("error %d while processing entry node and "
+ "trying to find parent inode node %lu",
+ err, inum);
+ goto out_dump;
+ }
+
+ nlen = le16_to_cpu(dent->nlen);
+ if (type == UBIFS_XENT_KEY) {
+ fscki1->calc_xcnt += 1;
+ fscki1->calc_xsz += CALC_DENT_SIZE(nlen);
+ fscki1->calc_xsz += CALC_XATTR_BYTES(fscki->size);
+ fscki1->calc_xnms += nlen;
+ } else {
+ fscki1->calc_sz += CALC_DENT_SIZE(nlen);
+ if (dent->type == UBIFS_ITYPE_DIR)
+ fscki1->calc_cnt += 1;
+ }
+ }
+
+out:
+ kfree(node);
+ return 0;
+
+out_dump:
+ ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs);
+ dbg_dump_node(c, node);
+out_free:
+ kfree(node);
+ return err;
+}
+
+/**
+ * free_inodes - free RB-tree of inodes.
+ * @fsckd: FS checking information
+ */
+static void free_inodes(struct fsck_data *fsckd)
+{
+ struct rb_node *this = fsckd->inodes.rb_node;
+ struct fsck_inode *fscki;
+
+ while (this) {
+ if (this->rb_left)
+ this = this->rb_left;
+ else if (this->rb_right)
+ this = this->rb_right;
+ else {
+ fscki = rb_entry(this, struct fsck_inode, rb);
+ this = rb_parent(this);
+ if (this) {
+ if (this->rb_left == &fscki->rb)
+ this->rb_left = NULL;
+ else
+ this->rb_right = NULL;
+ }
+ kfree(fscki);
+ }
+ }
+}
+
+/**
+ * check_inodes - checks all inodes.
+ * @c: UBIFS file-system description object
+ * @fsckd: FS checking information
+ *
+ * This is a helper function for 'dbg_check_filesystem()' which walks the
+ * RB-tree of inodes after the index scan has been finished, and checks that
+ * inode nlink, size, etc are correct. Returns zero if inodes are fine,
+ * %-EINVAL if not, and a negative error code in case of failure.
+ */
+static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd)
+{
+ int n, err;
+ union ubifs_key key;
+ struct ubifs_znode *znode;
+ struct ubifs_zbranch *zbr;
+ struct ubifs_ino_node *ino;
+ struct fsck_inode *fscki;
+ struct rb_node *this = rb_first(&fsckd->inodes);
+
+ while (this) {
+ fscki = rb_entry(this, struct fsck_inode, rb);
+ this = rb_next(this);
+
+ if (S_ISDIR(fscki->mode)) {
+ /*
+ * Directories have to have exactly one reference (they
+ * cannot have hardlinks), although root inode is an
+ * exception.
+ */
+ if (fscki->inum != UBIFS_ROOT_INO &&
+ fscki->references != 1) {
+ ubifs_err("directory inode %lu has %d "
+ "direntries which refer it, but "
+ "should be 1", fscki->inum,
+ fscki->references);
+ goto out_dump;
+ }
+ if (fscki->inum == UBIFS_ROOT_INO &&
+ fscki->references != 0) {
+ ubifs_err("root inode %lu has non-zero (%d) "
+ "direntries which refer it",
+ fscki->inum, fscki->references);
+ goto out_dump;
+ }
+ if (fscki->calc_sz != fscki->size) {
+ ubifs_err("directory inode %lu size is %lld, "
+ "but calculated size is %lld",
+ fscki->inum, fscki->size,
+ fscki->calc_sz);
+ goto out_dump;
+ }
+ if (fscki->calc_cnt != fscki->nlink) {
+ ubifs_err("directory inode %lu nlink is %d, "
+ "but calculated nlink is %d",
+ fscki->inum, fscki->nlink,
+ fscki->calc_cnt);
+ goto out_dump;
+ }
+ } else {
+ if (fscki->references != fscki->nlink) {
+ ubifs_err("inode %lu nlink is %d, but "
+ "calculated nlink is %d", fscki->inum,
+ fscki->nlink, fscki->references);
+ goto out_dump;
+ }
+ }
+ if (fscki->xattr_sz != fscki->calc_xsz) {
+ ubifs_err("inode %lu has xattr size %u, but "
+ "calculated size is %lld",
+ fscki->inum, fscki->xattr_sz,
+ fscki->calc_xsz);
+ goto out_dump;
+ }
+ if (fscki->xattr_cnt != fscki->calc_xcnt) {
+ ubifs_err("inode %lu has %u xattrs, but "
+ "calculated count is %lld", fscki->inum,
+ fscki->xattr_cnt, fscki->calc_xcnt);
+ goto out_dump;
+ }
+ if (fscki->xattr_nms != fscki->calc_xnms) {
+ ubifs_err("inode %lu has xattr names' size %u, but "
+ "calculated names' size is %lld",
+ fscki->inum, fscki->xattr_nms,
+ fscki->calc_xnms);
+ goto out_dump;
+ }
+ }
+
+ return 0;
+
+out_dump:
+ /* Read the bad inode and dump it */
+ ino_key_init(c, &key, fscki->inum);
+ err = ubifs_lookup_level0(c, &key, &znode, &n);
+ if (!err) {
+ ubifs_err("inode %lu not found in index", fscki->inum);
+ return -ENOENT;
+ } else if (err < 0) {
+ ubifs_err("error %d while looking up inode %lu",
+ err, fscki->inum);
+ return err;
+ }
+
+ zbr = &znode->zbranch[n];
+ ino = kmalloc(zbr->len, GFP_NOFS);
+ if (!ino)
+ return -ENOMEM;
+
+ err = ubifs_tnc_read_node(c, zbr, ino);
+ if (err) {
+ ubifs_err("cannot read inode node at LEB %d:%d, error %d",
+ zbr->lnum, zbr->offs, err);
+ kfree(ino);
+ return err;
+ }
+
+ ubifs_msg("dump of the inode %lu sitting in LEB %d:%d",
+ fscki->inum, zbr->lnum, zbr->offs);
+ dbg_dump_node(c, ino);
+ kfree(ino);
+ return -EINVAL;
+}
+
+/**
+ * dbg_check_filesystem - check the file-system.
+ * @c: UBIFS file-system description object
+ *
+ * This function checks the file system, namely:
+ * o makes sure that all leaf nodes exist and their CRCs are correct;
+ * o makes sure inode nlink, size, xattr size/count are correct (for all
+ * inodes).
+ *
+ * The function reads whole indexing tree and all nodes, so it is pretty
+ * heavy-weight. Returns zero if the file-system is consistent, %-EINVAL if
+ * not, and a negative error code in case of failure.
+ */
+int dbg_check_filesystem(struct ubifs_info *c)
+{
+ int err;
+ struct fsck_data fsckd;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_FS))
+ return 0;
+
+ fsckd.inodes = RB_ROOT;
+ err = dbg_walk_index(c, check_leaf, NULL, &fsckd);
+ if (err)
+ goto out_free;
+
+ err = check_inodes(c, &fsckd);
+ if (err)
+ goto out_free;
+
+ free_inodes(&fsckd);
+ return 0;
+
+out_free:
+ ubifs_err("file-system check failed with error %d", err);
+ dump_stack();
+ free_inodes(&fsckd);
+ return err;
+}
+
+static int invocation_cnt;
+
+int dbg_force_in_the_gaps(void)
+{
+ if (!dbg_force_in_the_gaps_enabled)
+ return 0;
+ /* Force in-the-gaps every 8th commit */
+ return !((invocation_cnt++) & 0x7);
+}
+
+/* Failure mode for recovery testing */
+
+#define chance(n, d) (simple_rand() <= (n) * 32768LL / (d))
+
+struct failure_mode_info {
+ struct list_head list;
+ struct ubifs_info *c;
+};
+
+static LIST_HEAD(fmi_list);
+static DEFINE_SPINLOCK(fmi_lock);
+
+static unsigned int next;
+
+static int simple_rand(void)
+{
+ if (next == 0)
+ next = current->pid;
+ next = next * 1103515245 + 12345;
+ return (next >> 16) & 32767;
+}
+
+void dbg_failure_mode_registration(struct ubifs_info *c)
+{
+ struct failure_mode_info *fmi;
+
+ fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS);
+ if (!fmi) {
+ dbg_err("Failed to register failure mode - no memory");
+ return;
+ }
+ fmi->c = c;
+ spin_lock(&fmi_lock);
+ list_add_tail(&fmi->list, &fmi_list);
+ spin_unlock(&fmi_lock);
+}
+
+void dbg_failure_mode_deregistration(struct ubifs_info *c)
+{
+ struct failure_mode_info *fmi, *tmp;
+
+ spin_lock(&fmi_lock);
+ list_for_each_entry_safe(fmi, tmp, &fmi_list, list)
+ if (fmi->c == c) {
+ list_del(&fmi->list);
+ kfree(fmi);
+ }
+ spin_unlock(&fmi_lock);
+}
+
+static struct ubifs_info *dbg_find_info(struct ubi_volume_desc *desc)
+{
+ struct failure_mode_info *fmi;
+
+ spin_lock(&fmi_lock);
+ list_for_each_entry(fmi, &fmi_list, list)
+ if (fmi->c->ubi == desc) {
+ struct ubifs_info *c = fmi->c;
+
+ spin_unlock(&fmi_lock);
+ return c;
+ }
+ spin_unlock(&fmi_lock);
+ return NULL;
+}
+
+static int in_failure_mode(struct ubi_volume_desc *desc)
+{
+ struct ubifs_info *c = dbg_find_info(desc);
+
+ if (c && dbg_failure_mode)
+ return c->failure_mode;
+ return 0;
+}
+
+static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
+{
+ struct ubifs_info *c = dbg_find_info(desc);
+
+ if (!c || !dbg_failure_mode)
+ return 0;
+ if (c->failure_mode)
+ return 1;
+ if (!c->fail_cnt) {
+ /* First call - decide delay to failure */
+ if (chance(1, 2)) {
+ unsigned int delay = 1 << (simple_rand() >> 11);
+
+ if (chance(1, 2)) {
+ c->fail_delay = 1;
+ c->fail_timeout = jiffies +
+ msecs_to_jiffies(delay);
+ dbg_rcvry("failing after %ums", delay);
+ } else {
+ c->fail_delay = 2;
+ c->fail_cnt_max = delay;
+ dbg_rcvry("failing after %u calls", delay);
+ }
+ }
+ c->fail_cnt += 1;
+ }
+ /* Determine if failure delay has expired */
+ if (c->fail_delay == 1) {
+ if (time_before(jiffies, c->fail_timeout))
+ return 0;
+ } else if (c->fail_delay == 2)
+ if (c->fail_cnt++ < c->fail_cnt_max)
+ return 0;
+ if (lnum == UBIFS_SB_LNUM) {
+ if (write) {
+ if (chance(1, 2))
+ return 0;
+ } else if (chance(19, 20))
+ return 0;
+ dbg_rcvry("failing in super block LEB %d", lnum);
+ } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) {
+ if (chance(19, 20))
+ return 0;
+ dbg_rcvry("failing in master LEB %d", lnum);
+ } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) {
+ if (write) {
+ if (chance(99, 100))
+ return 0;
+ } else if (chance(399, 400))
+ return 0;
+ dbg_rcvry("failing in log LEB %d", lnum);
+ } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) {
+ if (write) {
+ if (chance(7, 8))
+ return 0;
+ } else if (chance(19, 20))
+ return 0;
+ dbg_rcvry("failing in LPT LEB %d", lnum);
+ } else if (lnum >= c->orph_first && lnum <= c->orph_last) {
+ if (write) {
+ if (chance(1, 2))
+ return 0;
+ } else if (chance(9, 10))
+ return 0;
+ dbg_rcvry("failing in orphan LEB %d", lnum);
+ } else if (lnum == c->ihead_lnum) {
+ if (chance(99, 100))
+ return 0;
+ dbg_rcvry("failing in index head LEB %d", lnum);
+ } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) {
+ if (chance(9, 10))
+ return 0;
+ dbg_rcvry("failing in GC head LEB %d", lnum);
+ } else if (write && !RB_EMPTY_ROOT(&c->buds) &&
+ !ubifs_search_bud(c, lnum)) {
+ if (chance(19, 20))
+ return 0;
+ dbg_rcvry("failing in non-bud LEB %d", lnum);
+ } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND ||
+ c->cmt_state == COMMIT_RUNNING_REQUIRED) {
+ if (chance(999, 1000))
+ return 0;
+ dbg_rcvry("failing in bud LEB %d commit running", lnum);
+ } else {
+ if (chance(9999, 10000))
+ return 0;
+ dbg_rcvry("failing in bud LEB %d commit not running", lnum);
+ }
+ ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum);
+ c->failure_mode = 1;
+ dump_stack();
+ return 1;
+}
+
+static void cut_data(const void *buf, int len)
+{
+ int flen, i;
+ unsigned char *p = (void *)buf;
+
+ flen = (len * (long long)simple_rand()) >> 15;
+ for (i = flen; i < len; i++)
+ p[i] = 0xff;
+}
+
+int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
+ int len, int check)
+{
+ if (in_failure_mode(desc))
+ return -EIO;
+ return ubi_leb_read(desc, lnum, buf, offset, len, check);
+}
+
+int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
+ int offset, int len, int dtype)
+{
+ int err, failing;
+
+ if (in_failure_mode(desc))
+ return -EIO;
+ failing = do_fail(desc, lnum, 1);
+ if (failing)
+ cut_data(buf, len);
+ err = ubi_leb_write(desc, lnum, buf, offset, len, dtype);
+ if (err)
+ return err;
+ if (failing)
+ return -EIO;
+ return 0;
+}
+
+int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
+ int len, int dtype)
+{
+ int err;
+
+ if (do_fail(desc, lnum, 1))
+ return -EIO;
+ err = ubi_leb_change(desc, lnum, buf, len, dtype);
+ if (err)
+ return err;
+ if (do_fail(desc, lnum, 1))
+ return -EIO;
+ return 0;
+}
+
+int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum)
+{
+ int err;
+
+ if (do_fail(desc, lnum, 0))
+ return -EIO;
+ err = ubi_leb_erase(desc, lnum);
+ if (err)
+ return err;
+ if (do_fail(desc, lnum, 0))
+ return -EIO;
+ return 0;
+}
+
+int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum)
+{
+ int err;
+
+ if (do_fail(desc, lnum, 0))
+ return -EIO;
+ err = ubi_leb_unmap(desc, lnum);
+ if (err)
+ return err;
+ if (do_fail(desc, lnum, 0))
+ return -EIO;
+ return 0;
+}
+
+int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum)
+{
+ if (in_failure_mode(desc))
+ return -EIO;
+ return ubi_is_mapped(desc, lnum);
+}
+
+int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
+{
+ int err;
+
+ if (do_fail(desc, lnum, 0))
+ return -EIO;
+ err = ubi_leb_map(desc, lnum, dtype);
+ if (err)
+ return err;
+ if (do_fail(desc, lnum, 0))
+ return -EIO;
+ return 0;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
new file mode 100644
index 00000000000..50315fc5718
--- /dev/null
+++ b/fs/ubifs/debug.h
@@ -0,0 +1,398 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+#ifndef __UBIFS_DEBUG_H__
+#define __UBIFS_DEBUG_H__
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+#define UBIFS_DBG(op) op
+
+#define ubifs_assert(expr) do { \
+ if (unlikely(!(expr))) { \
+ printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
+ __func__, __LINE__, current->pid); \
+ dbg_dump_stack(); \
+ } \
+} while (0)
+
+#define ubifs_assert_cmt_locked(c) do { \
+ if (unlikely(down_write_trylock(&(c)->commit_sem))) { \
+ up_write(&(c)->commit_sem); \
+ printk(KERN_CRIT "commit lock is not locked!\n"); \
+ ubifs_assert(0); \
+ } \
+} while (0)
+
+#define dbg_dump_stack() do { \
+ if (!dbg_failure_mode) \
+ dump_stack(); \
+} while (0)
+
+/* Generic debugging messages */
+#define dbg_msg(fmt, ...) do { \
+ spin_lock(&dbg_lock); \
+ printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
+ __func__, ##__VA_ARGS__); \
+ spin_unlock(&dbg_lock); \
+} while (0)
+
+#define dbg_do_msg(typ, fmt, ...) do { \
+ if (ubifs_msg_flags & typ) \
+ dbg_msg(fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define dbg_err(fmt, ...) do { \
+ spin_lock(&dbg_lock); \
+ ubifs_err(fmt, ##__VA_ARGS__); \
+ spin_unlock(&dbg_lock); \
+} while (0)
+
+const char *dbg_key_str0(const struct ubifs_info *c,
+ const union ubifs_key *key);
+const char *dbg_key_str1(const struct ubifs_info *c,
+ const union ubifs_key *key);
+
+/*
+ * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message
+ * macros.
+ */
+#define DBGKEY(key) dbg_key_str0(c, (key))
+#define DBGKEY1(key) dbg_key_str1(c, (key))
+
+/* General messages */
+#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__)
+
+/* Additional journal messages */
+#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__)
+
+/* Additional TNC messages */
+#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__)
+
+/* Additional lprops messages */
+#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__)
+
+/* Additional LEB find messages */
+#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__)
+
+/* Additional mount messages */
+#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__)
+
+/* Additional I/O messages */
+#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__)
+
+/* Additional commit messages */
+#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__)
+
+/* Additional budgeting messages */
+#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__)
+
+/* Additional log messages */
+#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__)
+
+/* Additional gc messages */
+#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__)
+
+/* Additional scan messages */
+#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__)
+
+/* Additional recovery messages */
+#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
+
+/*
+ * Debugging message type flags (must match msg_type_names in debug.c).
+ *
+ * UBIFS_MSG_GEN: general messages
+ * UBIFS_MSG_JNL: journal messages
+ * UBIFS_MSG_MNT: mount messages
+ * UBIFS_MSG_CMT: commit messages
+ * UBIFS_MSG_FIND: LEB find messages
+ * UBIFS_MSG_BUDG: budgeting messages
+ * UBIFS_MSG_GC: garbage collection messages
+ * UBIFS_MSG_TNC: TNC messages
+ * UBIFS_MSG_LP: lprops messages
+ * UBIFS_MSG_IO: I/O messages
+ * UBIFS_MSG_LOG: log messages
+ * UBIFS_MSG_SCAN: scan messages
+ * UBIFS_MSG_RCVRY: recovery messages
+ */
+enum {
+ UBIFS_MSG_GEN = 0x1,
+ UBIFS_MSG_JNL = 0x2,
+ UBIFS_MSG_MNT = 0x4,
+ UBIFS_MSG_CMT = 0x8,
+ UBIFS_MSG_FIND = 0x10,
+ UBIFS_MSG_BUDG = 0x20,
+ UBIFS_MSG_GC = 0x40,
+ UBIFS_MSG_TNC = 0x80,
+ UBIFS_MSG_LP = 0x100,
+ UBIFS_MSG_IO = 0x200,
+ UBIFS_MSG_LOG = 0x400,
+ UBIFS_MSG_SCAN = 0x800,
+ UBIFS_MSG_RCVRY = 0x1000,
+};
+
+/* Debugging message type flags for each default debug message level */
+#define UBIFS_MSG_LVL_0 0
+#define UBIFS_MSG_LVL_1 0x1
+#define UBIFS_MSG_LVL_2 0x7f
+#define UBIFS_MSG_LVL_3 0xffff
+
+/*
+ * Debugging check flags (must match chk_names in debug.c).
+ *
+ * UBIFS_CHK_GEN: general checks
+ * UBIFS_CHK_TNC: check TNC
+ * UBIFS_CHK_IDX_SZ: check index size
+ * UBIFS_CHK_ORPH: check orphans
+ * UBIFS_CHK_OLD_IDX: check the old index
+ * UBIFS_CHK_LPROPS: check lprops
+ * UBIFS_CHK_FS: check the file-system
+ */
+enum {
+ UBIFS_CHK_GEN = 0x1,
+ UBIFS_CHK_TNC = 0x2,
+ UBIFS_CHK_IDX_SZ = 0x4,
+ UBIFS_CHK_ORPH = 0x8,
+ UBIFS_CHK_OLD_IDX = 0x10,
+ UBIFS_CHK_LPROPS = 0x20,
+ UBIFS_CHK_FS = 0x40,
+};
+
+/*
+ * Special testing flags (must match tst_names in debug.c).
+ *
+ * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
+ * UBIFS_TST_RCVRY: failure mode for recovery testing
+ */
+enum {
+ UBIFS_TST_FORCE_IN_THE_GAPS = 0x2,
+ UBIFS_TST_RCVRY = 0x4,
+};
+
+#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1
+#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1
+#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2
+#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2
+#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3
+#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3
+#else
+#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0
+#endif
+
+#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS
+#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff
+#else
+#define UBIFS_CHK_FLAGS_DEFAULT 0
+#endif
+
+extern spinlock_t dbg_lock;
+
+extern unsigned int ubifs_msg_flags;
+extern unsigned int ubifs_chk_flags;
+extern unsigned int ubifs_tst_flags;
+
+/* Dump functions */
+
+const char *dbg_ntype(int type);
+const char *dbg_cstate(int cmt_state);
+const char *dbg_get_key_dump(const struct ubifs_info *c,
+ const union ubifs_key *key);
+void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
+void dbg_dump_node(const struct ubifs_info *c, const void *node);
+void dbg_dump_budget_req(const struct ubifs_budget_req *req);
+void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
+void dbg_dump_budg(struct ubifs_info *c);
+void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
+void dbg_dump_lprops(struct ubifs_info *c);
+void dbg_dump_leb(const struct ubifs_info *c, int lnum);
+void dbg_dump_znode(const struct ubifs_info *c,
+ const struct ubifs_znode *znode);
+void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat);
+void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+ struct ubifs_nnode *parent, int iip);
+void dbg_dump_tnc(struct ubifs_info *c);
+void dbg_dump_index(struct ubifs_info *c);
+
+/* Checking helper functions */
+
+typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
+ struct ubifs_zbranch *zbr, void *priv);
+typedef int (*dbg_znode_callback)(struct ubifs_info *c,
+ struct ubifs_znode *znode, void *priv);
+int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
+ dbg_znode_callback znode_cb, void *priv);
+
+/* Checking functions */
+
+int dbg_check_lprops(struct ubifs_info *c);
+int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
+int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
+int dbg_check_cats(struct ubifs_info *c);
+int dbg_check_ltab(struct ubifs_info *c);
+int dbg_check_synced_i_size(struct inode *inode);
+int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir);
+int dbg_check_tnc(struct ubifs_info *c, int extra);
+int dbg_check_idx_size(struct ubifs_info *c, long long idx_size);
+int dbg_check_filesystem(struct ubifs_info *c);
+void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
+ int add_pos);
+int dbg_check_lprops(struct ubifs_info *c);
+int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
+ int row, int col);
+
+/* Force the use of in-the-gaps method for testing */
+
+#define dbg_force_in_the_gaps_enabled \
+ (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS)
+
+int dbg_force_in_the_gaps(void);
+
+/* Failure mode for recovery testing */
+
+#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
+
+void dbg_failure_mode_registration(struct ubifs_info *c);
+void dbg_failure_mode_deregistration(struct ubifs_info *c);
+
+#ifndef UBIFS_DBG_PRESERVE_UBI
+
+#define ubi_leb_read dbg_leb_read
+#define ubi_leb_write dbg_leb_write
+#define ubi_leb_change dbg_leb_change
+#define ubi_leb_erase dbg_leb_erase
+#define ubi_leb_unmap dbg_leb_unmap
+#define ubi_is_mapped dbg_is_mapped
+#define ubi_leb_map dbg_leb_map
+
+#endif
+
+int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
+ int len, int check);
+int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
+ int offset, int len, int dtype);
+int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
+ int len, int dtype);
+int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum);
+int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum);
+int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum);
+int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype);
+
+static inline int dbg_read(struct ubi_volume_desc *desc, int lnum, char *buf,
+ int offset, int len)
+{
+ return dbg_leb_read(desc, lnum, buf, offset, len, 0);
+}
+
+static inline int dbg_write(struct ubi_volume_desc *desc, int lnum,
+ const void *buf, int offset, int len)
+{
+ return dbg_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN);
+}
+
+static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
+ const void *buf, int len)
+{
+ return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
+}
+
+#else /* !CONFIG_UBIFS_FS_DEBUG */
+
+#define UBIFS_DBG(op)
+
+/* Use "if (0)" to make compiler check arguments even if debugging is off */
+#define ubifs_assert(expr) do { \
+ if (0 && (expr)) \
+ printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
+ __func__, __LINE__, current->pid); \
+} while (0)
+
+#define dbg_err(fmt, ...) do { \
+ if (0) \
+ ubifs_err(fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define dbg_msg(fmt, ...) do { \
+ if (0) \
+ printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \
+ current->pid, __func__, ##__VA_ARGS__); \
+} while (0)
+
+#define dbg_dump_stack()
+#define ubifs_assert_cmt_locked(c)
+
+#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+
+#define DBGKEY(key) ((char *)(key))
+#define DBGKEY1(key) ((char *)(key))
+
+#define dbg_ntype(type) ""
+#define dbg_cstate(cmt_state) ""
+#define dbg_get_key_dump(c, key) ({})
+#define dbg_dump_inode(c, inode) ({})
+#define dbg_dump_node(c, node) ({})
+#define dbg_dump_budget_req(req) ({})
+#define dbg_dump_lstats(lst) ({})
+#define dbg_dump_budg(c) ({})
+#define dbg_dump_lprop(c, lp) ({})
+#define dbg_dump_lprops(c) ({})
+#define dbg_dump_leb(c, lnum) ({})
+#define dbg_dump_znode(c, znode) ({})
+#define dbg_dump_heap(c, heap, cat) ({})
+#define dbg_dump_pnode(c, pnode, parent, iip) ({})
+#define dbg_dump_tnc(c) ({})
+#define dbg_dump_index(c) ({})
+
+#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
+#define dbg_old_index_check_init(c, zroot) 0
+#define dbg_check_old_index(c, zroot) 0
+#define dbg_check_cats(c) 0
+#define dbg_check_ltab(c) 0
+#define dbg_check_synced_i_size(inode) 0
+#define dbg_check_dir_size(c, dir) 0
+#define dbg_check_tnc(c, x) 0
+#define dbg_check_idx_size(c, idx_size) 0
+#define dbg_check_filesystem(c) 0
+#define dbg_check_heap(c, heap, cat, add_pos) ({})
+#define dbg_check_lprops(c) 0
+#define dbg_check_lpt_nodes(c, cnode, row, col) 0
+#define dbg_force_in_the_gaps_enabled 0
+#define dbg_force_in_the_gaps() 0
+#define dbg_failure_mode 0
+#define dbg_failure_mode_registration(c) ({})
+#define dbg_failure_mode_deregistration(c) ({})
+
+#endif /* !CONFIG_UBIFS_FS_DEBUG */
+
+#endif /* !__UBIFS_DEBUG_H__ */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
new file mode 100644
index 00000000000..526c01ec800
--- /dev/null
+++ b/fs/ubifs/dir.c
@@ -0,0 +1,1231 @@
+/* * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ * Zoltan Sogor
+ */
+
+/*
+ * This file implements directory operations.
+ *
+ * All FS operations in this file allocate budget before writing anything to the
+ * media. If they fail to allocate it, the error is returned. The only
+ * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()' which keep working even
+ * if they unable to allocate the budget, because deletion %-ENOSPC failure is
+ * not what users are usually ready to get. UBIFS budgeting subsystem has some
+ * space reserved for these purposes.
+ *
+ * All operations in this file write all inodes which they change straight
+ * away, instead of marking them dirty. For example, 'ubifs_link()' changes
+ * @i_size of the parent inode and writes the parent inode together with the
+ * target inode. This was done to simplify file-system recovery which would
+ * otherwise be very difficult to do. The only exception is rename which marks
+ * the re-named inode dirty (because its @i_ctime is updated) but does not
+ * write it, but just marks it as dirty.
+ */
+
+#include "ubifs.h"
+
+/**
+ * inherit_flags - inherit flags of the parent inode.
+ * @dir: parent inode
+ * @mode: new inode mode flags
+ *
+ * This is a helper function for 'ubifs_new_inode()' which inherits flag of the
+ * parent directory inode @dir. UBIFS inodes inherit the following flags:
+ * o %UBIFS_COMPR_FL, which is useful to switch compression on/of on
+ * sub-directory basis;
+ * o %UBIFS_SYNC_FL - useful for the same reasons;
+ * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories.
+ *
+ * This function returns the inherited flags.
+ */
+static int inherit_flags(const struct inode *dir, int mode)
+{
+ int flags;
+ const struct ubifs_inode *ui = ubifs_inode(dir);
+
+ if (!S_ISDIR(dir->i_mode))
+ /*
+ * The parent is not a directory, which means that an extended
+ * attribute inode is being created. No flags.
+ */
+ return 0;
+
+ flags = ui->flags & (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL);
+ if (!S_ISDIR(mode))
+ /* The "DIRSYNC" flag only applies to directories */
+ flags &= ~UBIFS_DIRSYNC_FL;
+ return flags;
+}
+
+/**
+ * ubifs_new_inode - allocate new UBIFS inode object.
+ * @c: UBIFS file-system description object
+ * @dir: parent directory inode
+ * @mode: inode mode flags
+ *
+ * This function finds an unused inode number, allocates new inode and
+ * initializes it. Returns new inode in case of success and an error code in
+ * case of failure.
+ */
+struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
+ int mode)
+{
+ struct inode *inode;
+ struct ubifs_inode *ui;
+
+ inode = new_inode(c->vfs_sb);
+ ui = ubifs_inode(inode);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Set 'S_NOCMTIME' to prevent VFS form updating [mc]time of inodes and
+ * marking them dirty in file write path (see 'file_update_time()').
+ * UBIFS has to fully control "clean <-> dirty" transitions of inodes
+ * to make budgeting work.
+ */
+ inode->i_flags |= (S_NOCMTIME);
+
+ inode->i_uid = current->fsuid;
+ if (dir->i_mode & S_ISGID) {
+ inode->i_gid = dir->i_gid;
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else
+ inode->i_gid = current->fsgid;
+ inode->i_mode = mode;
+ inode->i_mtime = inode->i_atime = inode->i_ctime =
+ ubifs_current_time(inode);
+ inode->i_mapping->nrpages = 0;
+ /* Disable readahead */
+ inode->i_mapping->backing_dev_info = &c->bdi;
+
+ switch (mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_mapping->a_ops = &ubifs_file_address_operations;
+ inode->i_op = &ubifs_file_inode_operations;
+ inode->i_fop = &ubifs_file_operations;
+ break;
+ case S_IFDIR:
+ inode->i_op = &ubifs_dir_inode_operations;
+ inode->i_fop = &ubifs_dir_operations;
+ inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ;
+ break;
+ case S_IFLNK:
+ inode->i_op = &ubifs_symlink_inode_operations;
+ break;
+ case S_IFSOCK:
+ case S_IFIFO:
+ case S_IFBLK:
+ case S_IFCHR:
+ inode->i_op = &ubifs_file_inode_operations;
+ break;
+ default:
+ BUG();
+ }
+
+ ui->flags = inherit_flags(dir, mode);
+ ubifs_set_inode_flags(inode);
+ if (S_ISREG(mode))
+ ui->compr_type = c->default_compr;
+ else
+ ui->compr_type = UBIFS_COMPR_NONE;
+ ui->synced_i_size = 0;
+
+ spin_lock(&c->cnt_lock);
+ /* Inode number overflow is currently not supported */
+ if (c->highest_inum >= INUM_WARN_WATERMARK) {
+ if (c->highest_inum >= INUM_WATERMARK) {
+ spin_unlock(&c->cnt_lock);
+ ubifs_err("out of inode numbers");
+ make_bad_inode(inode);
+ iput(inode);
+ return ERR_PTR(-EINVAL);
+ }
+ ubifs_warn("running out of inode numbers (current %lu, max %d)",
+ c->highest_inum, INUM_WATERMARK);
+ }
+
+ inode->i_ino = ++c->highest_inum;
+ /*
+ * The creation sequence number remains with this inode for its
+ * lifetime. All nodes for this inode have a greater sequence number,
+ * and so it is possible to distinguish obsolete nodes belonging to a
+ * previous incarnation of the same inode number - for example, for the
+ * purpose of rebuilding the index.
+ */
+ ui->creat_sqnum = ++c->max_sqnum;
+ spin_unlock(&c->cnt_lock);
+ return inode;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm)
+{
+ if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ return 0;
+ if (le16_to_cpu(dent->nlen) != nm->len)
+ return -EINVAL;
+ if (memcmp(dent->name, nm->name, nm->len))
+ return -EINVAL;
+ return 0;
+}
+
+#else
+
+#define dbg_check_name(dent, nm) 0
+
+#endif
+
+static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ int err;
+ union ubifs_key key;
+ struct inode *inode = NULL;
+ struct ubifs_dent_node *dent;
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+
+ dbg_gen("'%.*s' in dir ino %lu",
+ dentry->d_name.len, dentry->d_name.name, dir->i_ino);
+
+ if (dentry->d_name.len > UBIFS_MAX_NLEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+ if (!dent)
+ return ERR_PTR(-ENOMEM);
+
+ dent_key_init(c, &key, dir->i_ino, &dentry->d_name);
+
+ err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
+ if (err) {
+ if (err == -ENOENT) {
+ dbg_gen("not found");
+ goto done;
+ }
+ goto out;
+ }
+
+ if (dbg_check_name(dent, &dentry->d_name)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
+ if (IS_ERR(inode)) {
+ /*
+ * This should not happen. Probably the file-system needs
+ * checking.
+ */
+ err = PTR_ERR(inode);
+ ubifs_err("dead directory entry '%.*s', error %d",
+ dentry->d_name.len, dentry->d_name.name, err);
+ ubifs_ro_mode(c, err);
+ goto out;
+ }
+
+done:
+ kfree(dent);
+ /*
+ * Note, d_splice_alias() would be required instead if we supported
+ * NFS.
+ */
+ d_add(dentry, inode);
+ return NULL;
+
+out:
+ kfree(dent);
+ return ERR_PTR(err);
+}
+
+static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ struct inode *inode;
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+ .dirtied_ino = 1 };
+ struct ubifs_inode *dir_ui = ubifs_inode(dir);
+
+ /*
+ * Budget request settings: new inode, new direntry, changing the
+ * parent directory inode.
+ */
+
+ dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
+ dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ inode = ubifs_new_inode(c, dir, mode);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_budg;
+ }
+
+ mutex_lock(&dir_ui->ui_mutex);
+ dir->i_size += sz_change;
+ dir_ui->ui_size = dir->i_size;
+ dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ if (err)
+ goto out_cancel;
+ mutex_unlock(&dir_ui->ui_mutex);
+
+ ubifs_release_budget(c, &req);
+ insert_inode_hash(inode);
+ d_instantiate(dentry, inode);
+ return 0;
+
+out_cancel:
+ dir->i_size -= sz_change;
+ dir_ui->ui_size = dir->i_size;
+ mutex_unlock(&dir_ui->ui_mutex);
+ make_bad_inode(inode);
+ iput(inode);
+out_budg:
+ ubifs_release_budget(c, &req);
+ ubifs_err("cannot create regular file, error %d", err);
+ return err;
+}
+
+/**
+ * vfs_dent_type - get VFS directory entry type.
+ * @type: UBIFS directory entry type
+ *
+ * This function converts UBIFS directory entry type into VFS directory entry
+ * type.
+ */
+static unsigned int vfs_dent_type(uint8_t type)
+{
+ switch (type) {
+ case UBIFS_ITYPE_REG:
+ return DT_REG;
+ case UBIFS_ITYPE_DIR:
+ return DT_DIR;
+ case UBIFS_ITYPE_LNK:
+ return DT_LNK;
+ case UBIFS_ITYPE_BLK:
+ return DT_BLK;
+ case UBIFS_ITYPE_CHR:
+ return DT_CHR;
+ case UBIFS_ITYPE_FIFO:
+ return DT_FIFO;
+ case UBIFS_ITYPE_SOCK:
+ return DT_SOCK;
+ default:
+ BUG();
+ }
+ return 0;
+}
+
+/*
+ * The classical Unix view for directory is that it is a linear array of
+ * (name, inode number) entries. Linux/VFS assumes this model as well.
+ * Particularly, 'readdir()' call wants us to return a directory entry offset
+ * which later may be used to continue 'readdir()'ing the directory or to
+ * 'seek()' to that specific direntry. Obviously UBIFS does not really fit this
+ * model because directory entries are identified by keys, which may collide.
+ *
+ * UBIFS uses directory entry hash value for directory offsets, so
+ * 'seekdir()'/'telldir()' may not always work because of possible key
+ * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work
+ * properly by means of saving full directory entry name in the private field
+ * of the file description object.
+ *
+ * This means that UBIFS cannot support NFS which requires full
+ * 'seekdir()'/'telldir()' support.
+ */
+static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+ int err, over = 0;
+ struct qstr nm;
+ union ubifs_key key;
+ struct ubifs_dent_node *dent;
+ struct inode *dir = file->f_path.dentry->d_inode;
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+
+ dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);
+
+ if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2)
+ /*
+ * The directory was seek'ed to a senseless position or there
+ * are no more entries.
+ */
+ return 0;
+
+ /* File positions 0 and 1 correspond to "." and ".." */
+ if (file->f_pos == 0) {
+ ubifs_assert(!file->private_data);
+ over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
+ if (over)
+ return 0;
+ file->f_pos = 1;
+ }
+
+ if (file->f_pos == 1) {
+ ubifs_assert(!file->private_data);
+ over = filldir(dirent, "..", 2, 1,
+ parent_ino(file->f_path.dentry), DT_DIR);
+ if (over)
+ return 0;
+
+ /* Find the first entry in TNC and save it */
+ lowest_dent_key(c, &key, dir->i_ino);
+ nm.name = NULL;
+ dent = ubifs_tnc_next_ent(c, &key, &nm);
+ if (IS_ERR(dent)) {
+ err = PTR_ERR(dent);
+ goto out;
+ }
+
+ file->f_pos = key_hash_flash(c, &dent->key);
+ file->private_data = dent;
+ }
+
+ dent = file->private_data;
+ if (!dent) {
+ /*
+ * The directory was seek'ed to and is now readdir'ed.
+ * Find the entry corresponding to @file->f_pos or the
+ * closest one.
+ */
+ dent_key_init_hash(c, &key, dir->i_ino, file->f_pos);
+ nm.name = NULL;
+ dent = ubifs_tnc_next_ent(c, &key, &nm);
+ if (IS_ERR(dent)) {
+ err = PTR_ERR(dent);
+ goto out;
+ }
+ file->f_pos = key_hash_flash(c, &dent->key);
+ file->private_data = dent;
+ }
+
+ while (1) {
+ dbg_gen("feed '%s', ino %llu, new f_pos %#x",
+ dent->name, (unsigned long long)le64_to_cpu(dent->inum),
+ key_hash_flash(c, &dent->key));
+ ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum);
+
+ nm.len = le16_to_cpu(dent->nlen);
+ over = filldir(dirent, dent->name, nm.len, file->f_pos,
+ le64_to_cpu(dent->inum),
+ vfs_dent_type(dent->type));
+ if (over)
+ return 0;
+
+ /* Switch to the next entry */
+ key_read(c, &dent->key, &key);
+ nm.name = dent->name;
+ dent = ubifs_tnc_next_ent(c, &key, &nm);
+ if (IS_ERR(dent)) {
+ err = PTR_ERR(dent);
+ goto out;
+ }
+
+ kfree(file->private_data);
+ file->f_pos = key_hash_flash(c, &dent->key);
+ file->private_data = dent;
+ cond_resched();
+ }
+
+out:
+ if (err != -ENOENT) {
+ ubifs_err("cannot find next direntry, error %d", err);
+ return err;
+ }
+
+ kfree(file->private_data);
+ file->private_data = NULL;
+ file->f_pos = 2;
+ return 0;
+}
+
+/* If a directory is seeked, we have to free saved readdir() state */
+static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+ kfree(file->private_data);
+ file->private_data = NULL;
+ return generic_file_llseek(file, offset, origin);
+}
+
+/* Free saved readdir() state when the directory is closed */
+static int ubifs_dir_release(struct inode *dir, struct file *file)
+{
+ kfree(file->private_data);
+ file->private_data = NULL;
+ return 0;
+}
+
+/**
+ * lock_2_inodes - lock two UBIFS inodes.
+ * @inode1: first inode
+ * @inode2: second inode
+ */
+static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
+{
+ if (inode1->i_ino < inode2->i_ino) {
+ mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2);
+ mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3);
+ } else {
+ mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
+ mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3);
+ }
+}
+
+/**
+ * unlock_2_inodes - unlock two UBIFS inodes inodes.
+ * @inode1: first inode
+ * @inode2: second inode
+ */
+static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
+{
+ mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
+ mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+}
+
+static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ struct inode *inode = old_dentry->d_inode;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_inode *dir_ui = ubifs_inode(dir);
+ int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
+ .dirtied_ino_d = ALIGN(ui->data_len, 8) };
+
+ /*
+ * Budget request settings: new direntry, changing the target inode,
+ * changing the parent inode.
+ */
+
+ dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
+ dentry->d_name.len, dentry->d_name.name, inode->i_ino,
+ inode->i_nlink, dir->i_ino);
+ err = dbg_check_synced_i_size(inode);
+ if (err)
+ return err;
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ lock_2_inodes(dir, inode);
+ inc_nlink(inode);
+ atomic_inc(&inode->i_count);
+ inode->i_ctime = ubifs_current_time(inode);
+ dir->i_size += sz_change;
+ dir_ui->ui_size = dir->i_size;
+ dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ if (err)
+ goto out_cancel;
+ unlock_2_inodes(dir, inode);
+
+ ubifs_release_budget(c, &req);
+ d_instantiate(dentry, inode);
+ return 0;
+
+out_cancel:
+ dir->i_size -= sz_change;
+ dir_ui->ui_size = dir->i_size;
+ drop_nlink(inode);
+ unlock_2_inodes(dir, inode);
+ ubifs_release_budget(c, &req);
+ iput(inode);
+ return err;
+}
+
+static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ struct inode *inode = dentry->d_inode;
+ struct ubifs_inode *dir_ui = ubifs_inode(dir);
+ int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ int err, budgeted = 1;
+ struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+
+ /*
+ * Budget request settings: deletion direntry, deletion inode (+1 for
+ * @dirtied_ino), changing the parent directory inode. If budgeting
+ * fails, go ahead anyway because we have extra space reserved for
+ * deletions.
+ */
+
+ dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
+ dentry->d_name.len, dentry->d_name.name, inode->i_ino,
+ inode->i_nlink, dir->i_ino);
+ err = dbg_check_synced_i_size(inode);
+ if (err)
+ return err;
+
+ err = ubifs_budget_space(c, &req);
+ if (err) {
+ if (err != -ENOSPC)
+ return err;
+ budgeted = 0;
+ }
+
+ lock_2_inodes(dir, inode);
+ inode->i_ctime = ubifs_current_time(dir);
+ drop_nlink(inode);
+ dir->i_size -= sz_change;
+ dir_ui->ui_size = dir->i_size;
+ dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+ if (err)
+ goto out_cancel;
+ unlock_2_inodes(dir, inode);
+
+ if (budgeted)
+ ubifs_release_budget(c, &req);
+ else {
+ /* We've deleted something - clean the "no space" flags */
+ c->nospace = c->nospace_rp = 0;
+ smp_wmb();
+ }
+ return 0;
+
+out_cancel:
+ dir->i_size += sz_change;
+ dir_ui->ui_size = dir->i_size;
+ inc_nlink(inode);
+ unlock_2_inodes(dir, inode);
+ if (budgeted)
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+/**
+ * check_dir_empty - check if a directory is empty or not.
+ * @c: UBIFS file-system description object
+ * @dir: VFS inode object of the directory to check
+ *
+ * This function checks if directory @dir is empty. Returns zero if the
+ * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes
+ * in case of of errors.
+ */
+static int check_dir_empty(struct ubifs_info *c, struct inode *dir)
+{
+ struct qstr nm = { .name = NULL };
+ struct ubifs_dent_node *dent;
+ union ubifs_key key;
+ int err;
+
+ lowest_dent_key(c, &key, dir->i_ino);
+ dent = ubifs_tnc_next_ent(c, &key, &nm);
+ if (IS_ERR(dent)) {
+ err = PTR_ERR(dent);
+ if (err == -ENOENT)
+ err = 0;
+ } else {
+ kfree(dent);
+ err = -ENOTEMPTY;
+ }
+ return err;
+}
+
+static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ struct inode *inode = dentry->d_inode;
+ int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ int err, budgeted = 1;
+ struct ubifs_inode *dir_ui = ubifs_inode(dir);
+ struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+
+ /*
+ * Budget request settings: deletion direntry, deletion inode and
+ * changing the parent inode. If budgeting fails, go ahead anyway
+ * because we have extra space reserved for deletions.
+ */
+
+ dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
+ dentry->d_name.name, inode->i_ino, dir->i_ino);
+
+ err = check_dir_empty(c, dentry->d_inode);
+ if (err)
+ return err;
+
+ err = ubifs_budget_space(c, &req);
+ if (err) {
+ if (err != -ENOSPC)
+ return err;
+ budgeted = 0;
+ }
+
+ lock_2_inodes(dir, inode);
+ inode->i_ctime = ubifs_current_time(dir);
+ clear_nlink(inode);
+ drop_nlink(dir);
+ dir->i_size -= sz_change;
+ dir_ui->ui_size = dir->i_size;
+ dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+ if (err)
+ goto out_cancel;
+ unlock_2_inodes(dir, inode);
+
+ if (budgeted)
+ ubifs_release_budget(c, &req);
+ else {
+ /* We've deleted something - clean the "no space" flags */
+ c->nospace = c->nospace_rp = 0;
+ smp_wmb();
+ }
+ return 0;
+
+out_cancel:
+ dir->i_size += sz_change;
+ dir_ui->ui_size = dir->i_size;
+ inc_nlink(dir);
+ inc_nlink(inode);
+ inc_nlink(inode);
+ unlock_2_inodes(dir, inode);
+ if (budgeted)
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ struct inode *inode;
+ struct ubifs_inode *dir_ui = ubifs_inode(dir);
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
+
+ /*
+ * Budget request settings: new inode, new direntry and changing parent
+ * directory inode.
+ */
+
+ dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
+ dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ inode = ubifs_new_inode(c, dir, S_IFDIR | mode);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_budg;
+ }
+
+ mutex_lock(&dir_ui->ui_mutex);
+ insert_inode_hash(inode);
+ inc_nlink(inode);
+ inc_nlink(dir);
+ dir->i_size += sz_change;
+ dir_ui->ui_size = dir->i_size;
+ dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ if (err) {
+ ubifs_err("cannot create directory, error %d", err);
+ goto out_cancel;
+ }
+ mutex_unlock(&dir_ui->ui_mutex);
+
+ ubifs_release_budget(c, &req);
+ d_instantiate(dentry, inode);
+ return 0;
+
+out_cancel:
+ dir->i_size -= sz_change;
+ dir_ui->ui_size = dir->i_size;
+ drop_nlink(dir);
+ mutex_unlock(&dir_ui->ui_mutex);
+ make_bad_inode(inode);
+ iput(inode);
+out_budg:
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
+ int mode, dev_t rdev)
+{
+ struct inode *inode;
+ struct ubifs_inode *ui;
+ struct ubifs_inode *dir_ui = ubifs_inode(dir);
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ union ubifs_dev_desc *dev = NULL;
+ int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ int err, devlen = 0;
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+ .new_ino_d = ALIGN(devlen, 8),
+ .dirtied_ino = 1 };
+
+ /*
+ * Budget request settings: new inode, new direntry and changing parent
+ * directory inode.
+ */
+
+ dbg_gen("dent '%.*s' in dir ino %lu",
+ dentry->d_name.len, dentry->d_name.name, dir->i_ino);
+
+ if (!new_valid_dev(rdev))
+ return -EINVAL;
+
+ if (S_ISBLK(mode) || S_ISCHR(mode)) {
+ dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
+ if (!dev)
+ return -ENOMEM;
+ devlen = ubifs_encode_dev(dev, rdev);
+ }
+
+ err = ubifs_budget_space(c, &req);
+ if (err) {
+ kfree(dev);
+ return err;
+ }
+
+ inode = ubifs_new_inode(c, dir, mode);
+ if (IS_ERR(inode)) {
+ kfree(dev);
+ err = PTR_ERR(inode);
+ goto out_budg;
+ }
+
+ init_special_inode(inode, inode->i_mode, rdev);
+ inode->i_size = ubifs_inode(inode)->ui_size = devlen;
+ ui = ubifs_inode(inode);
+ ui->data = dev;
+ ui->data_len = devlen;
+
+ mutex_lock(&dir_ui->ui_mutex);
+ dir->i_size += sz_change;
+ dir_ui->ui_size = dir->i_size;
+ dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ if (err)
+ goto out_cancel;
+ mutex_unlock(&dir_ui->ui_mutex);
+
+ ubifs_release_budget(c, &req);
+ insert_inode_hash(inode);
+ d_instantiate(dentry, inode);
+ return 0;
+
+out_cancel:
+ dir->i_size -= sz_change;
+ dir_ui->ui_size = dir->i_size;
+ mutex_unlock(&dir_ui->ui_mutex);
+ make_bad_inode(inode);
+ iput(inode);
+out_budg:
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
+ const char *symname)
+{
+ struct inode *inode;
+ struct ubifs_inode *ui;
+ struct ubifs_inode *dir_ui = ubifs_inode(dir);
+ struct ubifs_info *c = dir->i_sb->s_fs_info;
+ int err, len = strlen(symname);
+ int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+ .new_ino_d = ALIGN(len, 8),
+ .dirtied_ino = 1 };
+
+ /*
+ * Budget request settings: new inode, new direntry and changing parent
+ * directory inode.
+ */
+
+ dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len,
+ dentry->d_name.name, symname, dir->i_ino);
+
+ if (len > UBIFS_MAX_INO_DATA)
+ return -ENAMETOOLONG;
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_budg;
+ }
+
+ ui = ubifs_inode(inode);
+ ui->data = kmalloc(len + 1, GFP_NOFS);
+ if (!ui->data) {
+ err = -ENOMEM;
+ goto out_inode;
+ }
+
+ memcpy(ui->data, symname, len);
+ ((char *)ui->data)[len] = '\0';
+ /*
+ * The terminating zero byte is not written to the flash media and it
+ * is put just to make later in-memory string processing simpler. Thus,
+ * data length is @len, not @len + %1.
+ */
+ ui->data_len = len;
+ inode->i_size = ubifs_inode(inode)->ui_size = len;
+
+ mutex_lock(&dir_ui->ui_mutex);
+ dir->i_size += sz_change;
+ dir_ui->ui_size = dir->i_size;
+ dir->i_mtime = dir->i_ctime = inode->i_ctime;
+ err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+ if (err)
+ goto out_cancel;
+ mutex_unlock(&dir_ui->ui_mutex);
+
+ ubifs_release_budget(c, &req);
+ insert_inode_hash(inode);
+ d_instantiate(dentry, inode);
+ return 0;
+
+out_cancel:
+ dir->i_size -= sz_change;
+ dir_ui->ui_size = dir->i_size;
+ mutex_unlock(&dir_ui->ui_mutex);
+out_inode:
+ make_bad_inode(inode);
+ iput(inode);
+out_budg:
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+/**
+ * lock_3_inodes - lock three UBIFS inodes for rename.
+ * @inode1: first inode
+ * @inode2: second inode
+ * @inode3: third inode
+ *
+ * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may
+ * be null.
+ */
+static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
+ struct inode *inode3)
+{
+ struct inode *i1, *i2, *i3;
+
+ if (!inode3) {
+ if (inode1 != inode2) {
+ lock_2_inodes(inode1, inode2);
+ return;
+ }
+ mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
+ return;
+ }
+
+ if (inode1 == inode2) {
+ lock_2_inodes(inode1, inode3);
+ return;
+ }
+
+ /* 3 different inodes */
+ if (inode1 < inode2) {
+ i3 = inode2;
+ if (inode1 < inode3) {
+ i1 = inode1;
+ i2 = inode3;
+ } else {
+ i1 = inode3;
+ i2 = inode1;
+ }
+ } else {
+ i3 = inode1;
+ if (inode2 < inode3) {
+ i1 = inode2;
+ i2 = inode3;
+ } else {
+ i1 = inode3;
+ i2 = inode2;
+ }
+ }
+ mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1);
+ lock_2_inodes(i2, i3);
+}
+
+/**
+ * unlock_3_inodes - unlock three UBIFS inodes for rename.
+ * @inode1: first inode
+ * @inode2: second inode
+ * @inode3: third inode
+ */
+static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
+ struct inode *inode3)
+{
+ mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
+ if (inode1 != inode2)
+ mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+ if (inode3)
+ mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
+}
+
+static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct ubifs_info *c = old_dir->i_sb->s_fs_info;
+ struct inode *old_inode = old_dentry->d_inode;
+ struct inode *new_inode = new_dentry->d_inode;
+ struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode);
+ int err, release, sync = 0, move = (new_dir != old_dir);
+ int is_dir = S_ISDIR(old_inode->i_mode);
+ int unlink = !!new_inode;
+ int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len);
+ int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len);
+ struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1,
+ .dirtied_ino = 3 };
+ struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
+ .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
+ struct timespec time;
+
+ /*
+ * Budget request settings: deletion direntry, new direntry, removing
+ * the old inode, and changing old and new parent directory inodes.
+ *
+ * However, this operation also marks the target inode as dirty and
+ * does not write it, so we allocate budget for the target inode
+ * separately.
+ */
+
+ dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in "
+ "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name,
+ old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
+ new_dentry->d_name.name, new_dir->i_ino);
+
+ if (unlink && is_dir) {
+ err = check_dir_empty(c, new_inode);
+ if (err)
+ return err;
+ }
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+ err = ubifs_budget_space(c, &ino_req);
+ if (err) {
+ ubifs_release_budget(c, &req);
+ return err;
+ }
+
+ lock_3_inodes(old_dir, new_dir, new_inode);
+
+ /*
+ * Like most other Unix systems, set the @i_ctime for inodes on a
+ * rename.
+ */
+ time = ubifs_current_time(old_dir);
+ old_inode->i_ctime = time;
+
+ /* We must adjust parent link count when renaming directories */
+ if (is_dir) {
+ if (move) {
+ /*
+ * @old_dir loses a link because we are moving
+ * @old_inode to a different directory.
+ */
+ drop_nlink(old_dir);
+ /*
+ * @new_dir only gains a link if we are not also
+ * overwriting an existing directory.
+ */
+ if (!unlink)
+ inc_nlink(new_dir);
+ } else {
+ /*
+ * @old_inode is not moving to a different directory,
+ * but @old_dir still loses a link if we are
+ * overwriting an existing directory.
+ */
+ if (unlink)
+ drop_nlink(old_dir);
+ }
+ }
+
+ old_dir->i_size -= old_sz;
+ ubifs_inode(old_dir)->ui_size = old_dir->i_size;
+ old_dir->i_mtime = old_dir->i_ctime = time;
+ new_dir->i_mtime = new_dir->i_ctime = time;
+
+ /*
+ * And finally, if we unlinked a direntry which happened to have the
+ * same name as the moved direntry, we have to decrement @i_nlink of
+ * the unlinked inode and change its ctime.
+ */
+ if (unlink) {
+ /*
+ * Directories cannot have hard-links, so if this is a
+ * directory, decrement its @i_nlink twice because an empty
+ * directory has @i_nlink 2.
+ */
+ if (is_dir)
+ drop_nlink(new_inode);
+ new_inode->i_ctime = time;
+ drop_nlink(new_inode);
+ } else {
+ new_dir->i_size += new_sz;
+ ubifs_inode(new_dir)->ui_size = new_dir->i_size;
+ }
+
+ /*
+ * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode
+ * is dirty, because this will be done later on at the end of
+ * 'ubifs_rename()'.
+ */
+ if (IS_SYNC(old_inode)) {
+ sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir);
+ if (unlink && IS_SYNC(new_inode))
+ sync = 1;
+ }
+ err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry,
+ sync);
+ if (err)
+ goto out_cancel;
+
+ unlock_3_inodes(old_dir, new_dir, new_inode);
+ ubifs_release_budget(c, &req);
+
+ mutex_lock(&old_inode_ui->ui_mutex);
+ release = old_inode_ui->dirty;
+ mark_inode_dirty_sync(old_inode);
+ mutex_unlock(&old_inode_ui->ui_mutex);
+
+ if (release)
+ ubifs_release_budget(c, &ino_req);
+ if (IS_SYNC(old_inode))
+ err = old_inode->i_sb->s_op->write_inode(old_inode, 1);
+ return err;
+
+out_cancel:
+ if (unlink) {
+ if (is_dir)
+ inc_nlink(new_inode);
+ inc_nlink(new_inode);
+ } else {
+ new_dir->i_size -= new_sz;
+ ubifs_inode(new_dir)->ui_size = new_dir->i_size;
+ }
+ old_dir->i_size += old_sz;
+ ubifs_inode(old_dir)->ui_size = old_dir->i_size;
+ if (is_dir) {
+ if (move) {
+ inc_nlink(old_dir);
+ if (!unlink)
+ drop_nlink(new_dir);
+ } else {
+ if (unlink)
+ inc_nlink(old_dir);
+ }
+ }
+ unlock_3_inodes(old_dir, new_dir, new_inode);
+ ubifs_release_budget(c, &ino_req);
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ loff_t size;
+ struct inode *inode = dentry->d_inode;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ mutex_lock(&ui->ui_mutex);
+ stat->dev = inode->i_sb->s_dev;
+ stat->ino = inode->i_ino;
+ stat->mode = inode->i_mode;
+ stat->nlink = inode->i_nlink;
+ stat->uid = inode->i_uid;
+ stat->gid = inode->i_gid;
+ stat->rdev = inode->i_rdev;
+ stat->atime = inode->i_atime;
+ stat->mtime = inode->i_mtime;
+ stat->ctime = inode->i_ctime;
+ stat->blksize = UBIFS_BLOCK_SIZE;
+ stat->size = ui->ui_size;
+
+ /*
+ * Unfortunately, the 'stat()' system call was designed for block
+ * device based file systems, and it is not appropriate for UBIFS,
+ * because UBIFS does not have notion of "block". For example, it is
+ * difficult to tell how many block a directory takes - it actually
+ * takes less than 300 bytes, but we have to round it to block size,
+ * which introduces large mistake. This makes utilities like 'du' to
+ * report completely senseless numbers. This is the reason why UBIFS
+ * goes the same way as JFFS2 - it reports zero blocks for everything
+ * but regular files, which makes more sense than reporting completely
+ * wrong sizes.
+ */
+ if (S_ISREG(inode->i_mode)) {
+ size = ui->xattr_size;
+ size += stat->size;
+ size = ALIGN(size, UBIFS_BLOCK_SIZE);
+ /*
+ * Note, user-space expects 512-byte blocks count irrespectively
+ * of what was reported in @stat->size.
+ */
+ stat->blocks = size >> 9;
+ } else
+ stat->blocks = 0;
+ mutex_unlock(&ui->ui_mutex);
+ return 0;
+}
+
+struct inode_operations ubifs_dir_inode_operations = {
+ .lookup = ubifs_lookup,
+ .create = ubifs_create,
+ .link = ubifs_link,
+ .symlink = ubifs_symlink,
+ .unlink = ubifs_unlink,
+ .mkdir = ubifs_mkdir,
+ .rmdir = ubifs_rmdir,
+ .mknod = ubifs_mknod,
+ .rename = ubifs_rename,
+ .setattr = ubifs_setattr,
+ .getattr = ubifs_getattr,
+#ifdef CONFIG_UBIFS_FS_XATTR
+ .setxattr = ubifs_setxattr,
+ .getxattr = ubifs_getxattr,
+ .listxattr = ubifs_listxattr,
+ .removexattr = ubifs_removexattr,
+#endif
+};
+
+struct file_operations ubifs_dir_operations = {
+ .llseek = ubifs_dir_llseek,
+ .release = ubifs_dir_release,
+ .read = generic_read_dir,
+ .readdir = ubifs_readdir,
+ .fsync = ubifs_fsync,
+ .unlocked_ioctl = ubifs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ubifs_compat_ioctl,
+#endif
+};
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
new file mode 100644
index 00000000000..3d698e2022b
--- /dev/null
+++ b/fs/ubifs/file.c
@@ -0,0 +1,1290 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file implements VFS file and inode operations of regular files, device
+ * nodes and symlinks as well as address space operations.
+ *
+ * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the
+ * page is dirty and is used for budgeting purposes - dirty pages should not be
+ * budgeted. The PG_checked flag is set if full budgeting is required for the
+ * page e.g., when it corresponds to a file hole or it is just beyond the file
+ * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to
+ * fail in this function, and the budget is released in 'ubifs_write_end()'. So
+ * the PG_private and PG_checked flags carry the information about how the page
+ * was budgeted, to make it possible to release the budget properly.
+ *
+ * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations
+ * we implement. However, this is not true for '->writepage()', which might be
+ * called with 'i_mutex' unlocked. For example, when pdflush is performing
+ * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the
+ * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is
+ * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim
+ * path'. So, in '->writepage()' we are only guaranteed that the page is
+ * locked.
+ *
+ * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g.,
+ * readahead path does not have it locked ("sys_read -> generic_file_aio_read
+ * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is
+ * not set as well. However, UBIFS disables readahead.
+ *
+ * This, for example means that there might be 2 concurrent '->writepage()'
+ * calls for the same inode, but different inode dirty pages.
+ */
+
+#include "ubifs.h"
+#include <linux/mount.h>
+#include <linux/namei.h>
+
+static int read_block(struct inode *inode, void *addr, unsigned int block,
+ struct ubifs_data_node *dn)
+{
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ int err, len, out_len;
+ union ubifs_key key;
+ unsigned int dlen;
+
+ data_key_init(c, &key, inode->i_ino, block);
+ err = ubifs_tnc_lookup(c, &key, dn);
+ if (err) {
+ if (err == -ENOENT)
+ /* Not found, so it must be a hole */
+ memset(addr, 0, UBIFS_BLOCK_SIZE);
+ return err;
+ }
+
+ ubifs_assert(dn->ch.sqnum > ubifs_inode(inode)->creat_sqnum);
+
+ len = le32_to_cpu(dn->size);
+ if (len <= 0 || len > UBIFS_BLOCK_SIZE)
+ goto dump;
+
+ dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+ out_len = UBIFS_BLOCK_SIZE;
+ err = ubifs_decompress(&dn->data, dlen, addr, &out_len,
+ le16_to_cpu(dn->compr_type));
+ if (err || len != out_len)
+ goto dump;
+
+ /*
+ * Data length can be less than a full block, even for blocks that are
+ * not the last in the file (e.g., as a result of making a hole and
+ * appending data). Ensure that the remainder is zeroed out.
+ */
+ if (len < UBIFS_BLOCK_SIZE)
+ memset(addr + len, 0, UBIFS_BLOCK_SIZE - len);
+
+ return 0;
+
+dump:
+ ubifs_err("bad data node (block %u, inode %lu)",
+ block, inode->i_ino);
+ dbg_dump_node(c, dn);
+ return -EINVAL;
+}
+
+static int do_readpage(struct page *page)
+{
+ void *addr;
+ int err = 0, i;
+ unsigned int block, beyond;
+ struct ubifs_data_node *dn;
+ struct inode *inode = page->mapping->host;
+ loff_t i_size = i_size_read(inode);
+
+ dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
+ inode->i_ino, page->index, i_size, page->flags);
+ ubifs_assert(!PageChecked(page));
+ ubifs_assert(!PagePrivate(page));
+
+ addr = kmap(page);
+
+ block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+ beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
+ if (block >= beyond) {
+ /* Reading beyond inode */
+ SetPageChecked(page);
+ memset(addr, 0, PAGE_CACHE_SIZE);
+ goto out;
+ }
+
+ dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS);
+ if (!dn) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ i = 0;
+ while (1) {
+ int ret;
+
+ if (block >= beyond) {
+ /* Reading beyond inode */
+ err = -ENOENT;
+ memset(addr, 0, UBIFS_BLOCK_SIZE);
+ } else {
+ ret = read_block(inode, addr, block, dn);
+ if (ret) {
+ err = ret;
+ if (err != -ENOENT)
+ break;
+ }
+ }
+ if (++i >= UBIFS_BLOCKS_PER_PAGE)
+ break;
+ block += 1;
+ addr += UBIFS_BLOCK_SIZE;
+ }
+ if (err) {
+ if (err == -ENOENT) {
+ /* Not found, so it must be a hole */
+ SetPageChecked(page);
+ dbg_gen("hole");
+ goto out_free;
+ }
+ ubifs_err("cannot read page %lu of inode %lu, error %d",
+ page->index, inode->i_ino, err);
+ goto error;
+ }
+
+out_free:
+ kfree(dn);
+out:
+ SetPageUptodate(page);
+ ClearPageError(page);
+ flush_dcache_page(page);
+ kunmap(page);
+ return 0;
+
+error:
+ kfree(dn);
+ ClearPageUptodate(page);
+ SetPageError(page);
+ flush_dcache_page(page);
+ kunmap(page);
+ return err;
+}
+
+/**
+ * release_new_page_budget - release budget of a new page.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function which releases budget corresponding to the budget
+ * of one new page of data.
+ */
+static void release_new_page_budget(struct ubifs_info *c)
+{
+ struct ubifs_budget_req req = { .recalculate = 1, .new_page = 1 };
+
+ ubifs_release_budget(c, &req);
+}
+
+/**
+ * release_existing_page_budget - release budget of an existing page.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function which releases budget corresponding to the budget
+ * of changing one one page of data which already exists on the flash media.
+ */
+static void release_existing_page_budget(struct ubifs_info *c)
+{
+ struct ubifs_budget_req req = { .dd_growth = c->page_budget};
+
+ ubifs_release_budget(c, &req);
+}
+
+static int write_begin_slow(struct address_space *mapping,
+ loff_t pos, unsigned len, struct page **pagep)
+{
+ struct inode *inode = mapping->host;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ struct ubifs_budget_req req = { .new_page = 1 };
+ int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+ struct page *page;
+
+ dbg_gen("ino %lu, pos %llu, len %u, i_size %lld",
+ inode->i_ino, pos, len, inode->i_size);
+
+ /*
+ * At the slow path we have to budget before locking the page, because
+ * budgeting may force write-back, which would wait on locked pages and
+ * deadlock if we had the page locked. At this point we do not know
+ * anything about the page, so assume that this is a new page which is
+ * written to a hole. This corresponds to largest budget. Later the
+ * budget will be amended if this is not true.
+ */
+ if (appending)
+ /* We are appending data, budget for inode change */
+ req.dirtied_ino = 1;
+
+ err = ubifs_budget_space(c, &req);
+ if (unlikely(err))
+ return err;
+
+ page = __grab_cache_page(mapping, index);
+ if (unlikely(!page)) {
+ ubifs_release_budget(c, &req);
+ return -ENOMEM;
+ }
+
+ if (!PageUptodate(page)) {
+ if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+ SetPageChecked(page);
+ else {
+ err = do_readpage(page);
+ if (err) {
+ unlock_page(page);
+ page_cache_release(page);
+ return err;
+ }
+ }
+
+ SetPageUptodate(page);
+ ClearPageError(page);
+ }
+
+ if (PagePrivate(page))
+ /*
+ * The page is dirty, which means it was budgeted twice:
+ * o first time the budget was allocated by the task which
+ * made the page dirty and set the PG_private flag;
+ * o and then we budgeted for it for the second time at the
+ * very beginning of this function.
+ *
+ * So what we have to do is to release the page budget we
+ * allocated.
+ */
+ release_new_page_budget(c);
+ else if (!PageChecked(page))
+ /*
+ * We are changing a page which already exists on the media.
+ * This means that changing the page does not make the amount
+ * of indexing information larger, and this part of the budget
+ * which we have already acquired may be released.
+ */
+ ubifs_convert_page_budget(c);
+
+ if (appending) {
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ /*
+ * 'ubifs_write_end()' is optimized from the fast-path part of
+ * 'ubifs_write_begin()' and expects the @ui_mutex to be locked
+ * if data is appended.
+ */
+ mutex_lock(&ui->ui_mutex);
+ if (ui->dirty)
+ /*
+ * The inode is dirty already, so we may free the
+ * budget we allocated.
+ */
+ ubifs_release_dirty_inode_budget(c, ui);
+ }
+
+ *pagep = page;
+ return 0;
+}
+
+/**
+ * allocate_budget - allocate budget for 'ubifs_write_begin()'.
+ * @c: UBIFS file-system description object
+ * @page: page to allocate budget for
+ * @ui: UBIFS inode object the page belongs to
+ * @appending: non-zero if the page is appended
+ *
+ * This is a helper function for 'ubifs_write_begin()' which allocates budget
+ * for the operation. The budget is allocated differently depending on whether
+ * this is appending, whether the page is dirty or not, and so on. This
+ * function leaves the @ui->ui_mutex locked in case of appending. Returns zero
+ * in case of success and %-ENOSPC in case of failure.
+ */
+static int allocate_budget(struct ubifs_info *c, struct page *page,
+ struct ubifs_inode *ui, int appending)
+{
+ struct ubifs_budget_req req = { .fast = 1 };
+
+ if (PagePrivate(page)) {
+ if (!appending)
+ /*
+ * The page is dirty and we are not appending, which
+ * means no budget is needed at all.
+ */
+ return 0;
+
+ mutex_lock(&ui->ui_mutex);
+ if (ui->dirty)
+ /*
+ * The page is dirty and we are appending, so the inode
+ * has to be marked as dirty. However, it is already
+ * dirty, so we do not need any budget. We may return,
+ * but @ui->ui_mutex hast to be left locked because we
+ * should prevent write-back from flushing the inode
+ * and freeing the budget. The lock will be released in
+ * 'ubifs_write_end()'.
+ */
+ return 0;
+
+ /*
+ * The page is dirty, we are appending, the inode is clean, so
+ * we need to budget the inode change.
+ */
+ req.dirtied_ino = 1;
+ } else {
+ if (PageChecked(page))
+ /*
+ * The page corresponds to a hole and does not
+ * exist on the media. So changing it makes
+ * make the amount of indexing information
+ * larger, and we have to budget for a new
+ * page.
+ */
+ req.new_page = 1;
+ else
+ /*
+ * Not a hole, the change will not add any new
+ * indexing information, budget for page
+ * change.
+ */
+ req.dirtied_page = 1;
+
+ if (appending) {
+ mutex_lock(&ui->ui_mutex);
+ if (!ui->dirty)
+ /*
+ * The inode is clean but we will have to mark
+ * it as dirty because we are appending. This
+ * needs a budget.
+ */
+ req.dirtied_ino = 1;
+ }
+ }
+
+ return ubifs_budget_space(c, &req);
+}
+
+/*
+ * This function is called when a page of data is going to be written. Since
+ * the page of data will not necessarily go to the flash straight away, UBIFS
+ * has to reserve space on the media for it, which is done by means of
+ * budgeting.
+ *
+ * This is the hot-path of the file-system and we are trying to optimize it as
+ * much as possible. For this reasons it is split on 2 parts - slow and fast.
+ *
+ * There many budgeting cases:
+ * o a new page is appended - we have to budget for a new page and for
+ * changing the inode; however, if the inode is already dirty, there is
+ * no need to budget for it;
+ * o an existing clean page is changed - we have budget for it; if the page
+ * does not exist on the media (a hole), we have to budget for a new
+ * page; otherwise, we may budget for changing an existing page; the
+ * difference between these cases is that changing an existing page does
+ * not introduce anything new to the FS indexing information, so it does
+ * not grow, and smaller budget is acquired in this case;
+ * o an existing dirty page is changed - no need to budget at all, because
+ * the page budget has been acquired by earlier, when the page has been
+ * marked dirty.
+ *
+ * UBIFS budgeting sub-system may force write-back if it thinks there is no
+ * space to reserve. This imposes some locking restrictions and makes it
+ * impossible to take into account the above cases, and makes it impossible to
+ * optimize budgeting.
+ *
+ * The solution for this is that the fast path of 'ubifs_write_begin()' assumes
+ * there is a plenty of flash space and the budget will be acquired quickly,
+ * without forcing write-back. The slow path does not make this assumption.
+ */
+static int ubifs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = mapping->host;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+ struct page *page;
+
+
+ ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
+
+ if (unlikely(c->ro_media))
+ return -EROFS;
+
+ /* Try out the fast-path part first */
+ page = __grab_cache_page(mapping, index);
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ if (!PageUptodate(page)) {
+ /* The page is not loaded from the flash */
+ if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+ /*
+ * We change whole page so no need to load it. But we
+ * have to set the @PG_checked flag to make the further
+ * code the page is new. This might be not true, but it
+ * is better to budget more that to read the page from
+ * the media.
+ */
+ SetPageChecked(page);
+ else {
+ err = do_readpage(page);
+ if (err) {
+ unlock_page(page);
+ page_cache_release(page);
+ return err;
+ }
+ }
+
+ SetPageUptodate(page);
+ ClearPageError(page);
+ }
+
+ err = allocate_budget(c, page, ui, appending);
+ if (unlikely(err)) {
+ ubifs_assert(err == -ENOSPC);
+ /*
+ * Budgeting failed which means it would have to force
+ * write-back but didn't, because we set the @fast flag in the
+ * request. Write-back cannot be done now, while we have the
+ * page locked, because it would deadlock. Unlock and free
+ * everything and fall-back to slow-path.
+ */
+ if (appending) {
+ ubifs_assert(mutex_is_locked(&ui->ui_mutex));
+ mutex_unlock(&ui->ui_mutex);
+ }
+ unlock_page(page);
+ page_cache_release(page);
+
+ return write_begin_slow(mapping, pos, len, pagep);
+ }
+
+ /*
+ * Whee, we aquired budgeting quickly - without involving
+ * garbage-collection, committing or forceing write-back. We return
+ * with @ui->ui_mutex locked if we are appending pages, and unlocked
+ * otherwise. This is an optimization (slightly hacky though).
+ */
+ *pagep = page;
+ return 0;
+
+}
+
+/**
+ * cancel_budget - cancel budget.
+ * @c: UBIFS file-system description object
+ * @page: page to cancel budget for
+ * @ui: UBIFS inode object the page belongs to
+ * @appending: non-zero if the page is appended
+ *
+ * This is a helper function for a page write operation. It unlocks the
+ * @ui->ui_mutex in case of appending.
+ */
+static void cancel_budget(struct ubifs_info *c, struct page *page,
+ struct ubifs_inode *ui, int appending)
+{
+ if (appending) {
+ if (!ui->dirty)
+ ubifs_release_dirty_inode_budget(c, ui);
+ mutex_unlock(&ui->ui_mutex);
+ }
+ if (!PagePrivate(page)) {
+ if (PageChecked(page))
+ release_new_page_budget(c);
+ else
+ release_existing_page_budget(c);
+ }
+}
+
+static int ubifs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ loff_t end_pos = pos + len;
+ int appending = !!(end_pos > inode->i_size);
+
+ dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld",
+ inode->i_ino, pos, page->index, len, copied, inode->i_size);
+
+ if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) {
+ /*
+ * VFS copied less data to the page that it intended and
+ * declared in its '->write_begin()' call via the @len
+ * argument. If the page was not up-to-date, and @len was
+ * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did
+ * not load it from the media (for optimization reasons). This
+ * means that part of the page contains garbage. So read the
+ * page now.
+ */
+ dbg_gen("copied %d instead of %d, read page and repeat",
+ copied, len);
+ cancel_budget(c, page, ui, appending);
+
+ /*
+ * Return 0 to force VFS to repeat the whole operation, or the
+ * error code if 'do_readpage()' failes.
+ */
+ copied = do_readpage(page);
+ goto out;
+ }
+
+ if (!PagePrivate(page)) {
+ SetPagePrivate(page);
+ atomic_long_inc(&c->dirty_pg_cnt);
+ __set_page_dirty_nobuffers(page);
+ }
+
+ if (appending) {
+ i_size_write(inode, end_pos);
+ ui->ui_size = end_pos;
+ /*
+ * Note, we do not set @I_DIRTY_PAGES (which means that the
+ * inode has dirty pages), this has been done in
+ * '__set_page_dirty_nobuffers()'.
+ */
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ ubifs_assert(mutex_is_locked(&ui->ui_mutex));
+ mutex_unlock(&ui->ui_mutex);
+ }
+
+out:
+ unlock_page(page);
+ page_cache_release(page);
+ return copied;
+}
+
+static int ubifs_readpage(struct file *file, struct page *page)
+{
+ do_readpage(page);
+ unlock_page(page);
+ return 0;
+}
+
+static int do_writepage(struct page *page, int len)
+{
+ int err = 0, i, blen;
+ unsigned int block;
+ void *addr;
+ union ubifs_key key;
+ struct inode *inode = page->mapping->host;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+
+#ifdef UBIFS_DEBUG
+ spin_lock(&ui->ui_lock);
+ ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE);
+ spin_unlock(&ui->ui_lock);
+#endif
+
+ /* Update radix tree tags */
+ set_page_writeback(page);
+
+ addr = kmap(page);
+ block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+ i = 0;
+ while (len) {
+ blen = min_t(int, len, UBIFS_BLOCK_SIZE);
+ data_key_init(c, &key, inode->i_ino, block);
+ err = ubifs_jnl_write_data(c, inode, &key, addr, blen);
+ if (err)
+ break;
+ if (++i >= UBIFS_BLOCKS_PER_PAGE)
+ break;
+ block += 1;
+ addr += blen;
+ len -= blen;
+ }
+ if (err) {
+ SetPageError(page);
+ ubifs_err("cannot write page %lu of inode %lu, error %d",
+ page->index, inode->i_ino, err);
+ ubifs_ro_mode(c, err);
+ }
+
+ ubifs_assert(PagePrivate(page));
+ if (PageChecked(page))
+ release_new_page_budget(c);
+ else
+ release_existing_page_budget(c);
+
+ atomic_long_dec(&c->dirty_pg_cnt);
+ ClearPagePrivate(page);
+ ClearPageChecked(page);
+
+ kunmap(page);
+ unlock_page(page);
+ end_page_writeback(page);
+ return err;
+}
+
+/*
+ * When writing-back dirty inodes, VFS first writes-back pages belonging to the
+ * inode, then the inode itself. For UBIFS this may cause a problem. Consider a
+ * situation when a we have an inode with size 0, then a megabyte of data is
+ * appended to the inode, then write-back starts and flushes some amount of the
+ * dirty pages, the journal becomes full, commit happens and finishes, and then
+ * an unclean reboot happens. When the file system is mounted next time, the
+ * inode size would still be 0, but there would be many pages which are beyond
+ * the inode size, they would be indexed and consume flash space. Because the
+ * journal has been committed, the replay would not be able to detect this
+ * situation and correct the inode size. This means UBIFS would have to scan
+ * whole index and correct all inode sizes, which is long an unacceptable.
+ *
+ * To prevent situations like this, UBIFS writes pages back only if they are
+ * within last synchronized inode size, i.e. the the size which has been
+ * written to the flash media last time. Otherwise, UBIFS forces inode
+ * write-back, thus making sure the on-flash inode contains current inode size,
+ * and then keeps writing pages back.
+ *
+ * Some locking issues explanation. 'ubifs_writepage()' first is called with
+ * the page locked, and it locks @ui_mutex. However, write-back does take inode
+ * @i_mutex, which means other VFS operations may be run on this inode at the
+ * same time. And the problematic one is truncation to smaller size, from where
+ * we have to call 'vmtruncate()', which first changes @inode->i_size, then
+ * drops the truncated pages. And while dropping the pages, it takes the page
+ * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with
+ * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
+ * means that @inode->i_size is changed while @ui_mutex is unlocked.
+ *
+ * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
+ * inode size. How do we do this if @inode->i_size may became smaller while we
+ * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
+ * @ui->ui_isize "shadow" field which UBIFS uses instead of @inode->i_size
+ * internally and updates it under @ui_mutex.
+ *
+ * Q: why we do not worry that if we race with truncation, we may end up with a
+ * situation when the inode is truncated while we are in the middle of
+ * 'do_writepage()', so we do write beyond inode size?
+ * A: If we are in the middle of 'do_writepage()', truncation would be locked
+ * on the page lock and it would not write the truncated inode node to the
+ * journal before we have finished.
+ */
+static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ loff_t i_size = i_size_read(inode), synced_i_size;
+ pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+ int err, len = i_size & (PAGE_CACHE_SIZE - 1);
+ void *kaddr;
+
+ dbg_gen("ino %lu, pg %lu, pg flags %#lx",
+ inode->i_ino, page->index, page->flags);
+ ubifs_assert(PagePrivate(page));
+
+ /* Is the page fully outside @i_size? (truncate in progress) */
+ if (page->index > end_index || (page->index == end_index && !len)) {
+ err = 0;
+ goto out_unlock;
+ }
+
+ spin_lock(&ui->ui_lock);
+ synced_i_size = ui->synced_i_size;
+ spin_unlock(&ui->ui_lock);
+
+ /* Is the page fully inside @i_size? */
+ if (page->index < end_index) {
+ if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {
+ err = inode->i_sb->s_op->write_inode(inode, 1);
+ if (err)
+ goto out_unlock;
+ /*
+ * The inode has been written, but the write-buffer has
+ * not been synchronized, so in case of an unclean
+ * reboot we may end up with some pages beyond inode
+ * size, but they would be in the journal (because
+ * commit flushes write buffers) and recovery would deal
+ * with this.
+ */
+ }
+ return do_writepage(page, PAGE_CACHE_SIZE);
+ }
+
+ /*
+ * The page straddles @i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ kaddr = kmap_atomic(page, KM_USER0);
+ memset(kaddr + len, 0, PAGE_CACHE_SIZE - len);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr, KM_USER0);
+
+ if (i_size > synced_i_size) {
+ err = inode->i_sb->s_op->write_inode(inode, 1);
+ if (err)
+ goto out_unlock;
+ }
+
+ return do_writepage(page, len);
+
+out_unlock:
+ unlock_page(page);
+ return err;
+}
+
+/**
+ * do_attr_changes - change inode attributes.
+ * @inode: inode to change attributes for
+ * @attr: describes attributes to change
+ */
+static void do_attr_changes(struct inode *inode, const struct iattr *attr)
+{
+ if (attr->ia_valid & ATTR_UID)
+ inode->i_uid = attr->ia_uid;
+ if (attr->ia_valid & ATTR_GID)
+ inode->i_gid = attr->ia_gid;
+ if (attr->ia_valid & ATTR_ATIME)
+ inode->i_atime = timespec_trunc(attr->ia_atime,
+ inode->i_sb->s_time_gran);
+ if (attr->ia_valid & ATTR_MTIME)
+ inode->i_mtime = timespec_trunc(attr->ia_mtime,
+ inode->i_sb->s_time_gran);
+ if (attr->ia_valid & ATTR_CTIME)
+ inode->i_ctime = timespec_trunc(attr->ia_ctime,
+ inode->i_sb->s_time_gran);
+ if (attr->ia_valid & ATTR_MODE) {
+ umode_t mode = attr->ia_mode;
+
+ if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+ mode &= ~S_ISGID;
+ inode->i_mode = mode;
+ }
+}
+
+/**
+ * do_truncation - truncate an inode.
+ * @c: UBIFS file-system description object
+ * @inode: inode to truncate
+ * @attr: inode attribute changes description
+ *
+ * This function implements VFS '->setattr()' call when the inode is truncated
+ * to a smaller size. Returns zero in case of success and a negative error code
+ * in case of failure.
+ */
+static int do_truncation(struct ubifs_info *c, struct inode *inode,
+ const struct iattr *attr)
+{
+ int err;
+ struct ubifs_budget_req req;
+ loff_t old_size = inode->i_size, new_size = attr->ia_size;
+ int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size);
+ memset(&req, 0, sizeof(struct ubifs_budget_req));
+
+ /*
+ * If this is truncation to a smaller size, and we do not truncate on a
+ * block boundary, budget for changing one data block, because the last
+ * block will be re-written.
+ */
+ if (new_size & (UBIFS_BLOCK_SIZE - 1))
+ req.dirtied_page = 1;
+
+ req.dirtied_ino = 1;
+ /* A funny way to budget for truncation node */
+ req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ;
+ err = ubifs_budget_space(c, &req);
+ if (err) {
+ /*
+ * Treat truncations to zero as deletion and always allow them,
+ * just like we do for '->unlink()'.
+ */
+ if (new_size || err != -ENOSPC)
+ return err;
+ budgeted = 0;
+ }
+
+ err = vmtruncate(inode, new_size);
+ if (err)
+ goto out_budg;
+
+ if (offset) {
+ pgoff_t index = new_size >> PAGE_CACHE_SHIFT;
+ struct page *page;
+
+ page = find_lock_page(inode->i_mapping, index);
+ if (page) {
+ if (PageDirty(page)) {
+ /*
+ * 'ubifs_jnl_truncate()' will try to truncate
+ * the last data node, but it contains
+ * out-of-date data because the page is dirty.
+ * Write the page now, so that
+ * 'ubifs_jnl_truncate()' will see an already
+ * truncated (and up to date) data node.
+ */
+ ubifs_assert(PagePrivate(page));
+
+ clear_page_dirty_for_io(page);
+ if (UBIFS_BLOCKS_PER_PAGE_SHIFT)
+ offset = new_size &
+ (PAGE_CACHE_SIZE - 1);
+ err = do_writepage(page, offset);
+ page_cache_release(page);
+ if (err)
+ goto out_budg;
+ /*
+ * We could now tell 'ubifs_jnl_truncate()' not
+ * to read the last block.
+ */
+ } else {
+ /*
+ * We could 'kmap()' the page and pass the data
+ * to 'ubifs_jnl_truncate()' to save it from
+ * having to read it.
+ */
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ }
+ }
+
+ mutex_lock(&ui->ui_mutex);
+ ui->ui_size = inode->i_size;
+ /* Truncation changes inode [mc]time */
+ inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ /* The other attributes may be changed at the same time as well */
+ do_attr_changes(inode, attr);
+
+ err = ubifs_jnl_truncate(c, inode, old_size, new_size);
+ mutex_unlock(&ui->ui_mutex);
+out_budg:
+ if (budgeted)
+ ubifs_release_budget(c, &req);
+ else {
+ c->nospace = c->nospace_rp = 0;
+ smp_wmb();
+ }
+ return err;
+}
+
+/**
+ * do_setattr - change inode attributes.
+ * @c: UBIFS file-system description object
+ * @inode: inode to change attributes for
+ * @attr: inode attribute changes description
+ *
+ * This function implements VFS '->setattr()' call for all cases except
+ * truncations to smaller size. Returns zero in case of success and a negative
+ * error code in case of failure.
+ */
+static int do_setattr(struct ubifs_info *c, struct inode *inode,
+ const struct iattr *attr)
+{
+ int err, release;
+ loff_t new_size = attr->ia_size;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_budget_req req = { .dirtied_ino = 1,
+ .dirtied_ino_d = ALIGN(ui->data_len, 8) };
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ dbg_gen("size %lld -> %lld", inode->i_size, new_size);
+ err = vmtruncate(inode, new_size);
+ if (err)
+ goto out;
+ }
+
+ mutex_lock(&ui->ui_mutex);
+ if (attr->ia_valid & ATTR_SIZE) {
+ /* Truncation changes inode [mc]time */
+ inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ /* 'vmtruncate()' changed @i_size, update @ui_size */
+ ui->ui_size = inode->i_size;
+ }
+
+ do_attr_changes(inode, attr);
+
+ release = ui->dirty;
+ if (attr->ia_valid & ATTR_SIZE)
+ /*
+ * Inode length changed, so we have to make sure
+ * @I_DIRTY_DATASYNC is set.
+ */
+ __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ else
+ mark_inode_dirty_sync(inode);
+ mutex_unlock(&ui->ui_mutex);
+
+ if (release)
+ ubifs_release_budget(c, &req);
+ if (IS_SYNC(inode))
+ err = inode->i_sb->s_op->write_inode(inode, 1);
+ return err;
+
+out:
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+int ubifs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+ int err;
+ struct inode *inode = dentry->d_inode;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+
+ dbg_gen("ino %lu, mode %#x, ia_valid %#x",
+ inode->i_ino, inode->i_mode, attr->ia_valid);
+ err = inode_change_ok(inode, attr);
+ if (err)
+ return err;
+
+ err = dbg_check_synced_i_size(inode);
+ if (err)
+ return err;
+
+ if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size)
+ /* Truncation to a smaller size */
+ err = do_truncation(c, inode, attr);
+ else
+ err = do_setattr(c, inode, attr);
+
+ return err;
+}
+
+static void ubifs_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct inode *inode = page->mapping->host;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+
+ ubifs_assert(PagePrivate(page));
+ if (offset)
+ /* Partial page remains dirty */
+ return;
+
+ if (PageChecked(page))
+ release_new_page_budget(c);
+ else
+ release_existing_page_budget(c);
+
+ atomic_long_dec(&c->dirty_pg_cnt);
+ ClearPagePrivate(page);
+ ClearPageChecked(page);
+}
+
+static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ struct ubifs_inode *ui = ubifs_inode(dentry->d_inode);
+
+ nd_set_link(nd, ui->data);
+ return NULL;
+}
+
+int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+ struct inode *inode = dentry->d_inode;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ int err;
+
+ dbg_gen("syncing inode %lu", inode->i_ino);
+
+ /*
+ * VFS has already synchronized dirty pages for this inode. Synchronize
+ * the inode unless this is a 'datasync()' call.
+ */
+ if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
+ err = inode->i_sb->s_op->write_inode(inode, 1);
+ if (err)
+ return err;
+ }
+
+ /*
+ * Nodes related to this inode may still sit in a write-buffer. Flush
+ * them.
+ */
+ err = ubifs_sync_wbufs_by_inode(c, inode);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+/**
+ * mctime_update_needed - check if mtime or ctime update is needed.
+ * @inode: the inode to do the check for
+ * @now: current time
+ *
+ * This helper function checks if the inode mtime/ctime should be updated or
+ * not. If current values of the time-stamps are within the UBIFS inode time
+ * granularity, they are not updated. This is an optimization.
+ */
+static inline int mctime_update_needed(const struct inode *inode,
+ const struct timespec *now)
+{
+ if (!timespec_equal(&inode->i_mtime, now) ||
+ !timespec_equal(&inode->i_ctime, now))
+ return 1;
+ return 0;
+}
+
+/**
+ * update_ctime - update mtime and ctime of an inode.
+ * @c: UBIFS file-system description object
+ * @inode: inode to update
+ *
+ * This function updates mtime and ctime of the inode if it is not equivalent to
+ * current time. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int update_mctime(struct ubifs_info *c, struct inode *inode)
+{
+ struct timespec now = ubifs_current_time(inode);
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ if (mctime_update_needed(inode, &now)) {
+ int err, release;
+ struct ubifs_budget_req req = { .dirtied_ino = 1,
+ .dirtied_ino_d = ALIGN(ui->data_len, 8) };
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ mutex_lock(&ui->ui_mutex);
+ inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ release = ui->dirty;
+ mark_inode_dirty_sync(inode);
+ mutex_unlock(&ui->ui_mutex);
+ if (release)
+ ubifs_release_budget(c, &req);
+ }
+
+ return 0;
+}
+
+static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ int err;
+ ssize_t ret;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+
+ err = update_mctime(c, inode);
+ if (err)
+ return err;
+
+ ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ if (ret < 0)
+ return ret;
+
+ if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) {
+ err = ubifs_sync_wbufs_by_inode(c, inode);
+ if (err)
+ return err;
+ }
+
+ return ret;
+}
+
+static int ubifs_set_page_dirty(struct page *page)
+{
+ int ret;
+
+ ret = __set_page_dirty_nobuffers(page);
+ /*
+ * An attempt to dirty a page without budgeting for it - should not
+ * happen.
+ */
+ ubifs_assert(ret == 0);
+ return ret;
+}
+
+static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
+{
+ /*
+ * An attempt to release a dirty page without budgeting for it - should
+ * not happen.
+ */
+ if (PageWriteback(page))
+ return 0;
+ ubifs_assert(PagePrivate(page));
+ ubifs_assert(0);
+ ClearPagePrivate(page);
+ ClearPageChecked(page);
+ return 1;
+}
+
+/*
+ * mmap()d file has taken write protection fault and is being made
+ * writable. UBIFS must ensure page is budgeted for.
+ */
+static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ struct timespec now = ubifs_current_time(inode);
+ struct ubifs_budget_req req = { .new_page = 1 };
+ int err, update_time;
+
+ dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index,
+ i_size_read(inode));
+ ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
+
+ if (unlikely(c->ro_media))
+ return -EROFS;
+
+ /*
+ * We have not locked @page so far so we may budget for changing the
+ * page. Note, we cannot do this after we locked the page, because
+ * budgeting may cause write-back which would cause deadlock.
+ *
+ * At the moment we do not know whether the page is dirty or not, so we
+ * assume that it is not and budget for a new page. We could look at
+ * the @PG_private flag and figure this out, but we may race with write
+ * back and the page state may change by the time we lock it, so this
+ * would need additional care. We do not bother with this at the
+ * moment, although it might be good idea to do. Instead, we allocate
+ * budget for a new page and amend it later on if the page was in fact
+ * dirty.
+ *
+ * The budgeting-related logic of this function is similar to what we
+ * do in 'ubifs_write_begin()' and 'ubifs_write_end()'. Glance there
+ * for more comments.
+ */
+ update_time = mctime_update_needed(inode, &now);
+ if (update_time)
+ /*
+ * We have to change inode time stamp which requires extra
+ * budgeting.
+ */
+ req.dirtied_ino = 1;
+
+ err = ubifs_budget_space(c, &req);
+ if (unlikely(err)) {
+ if (err == -ENOSPC)
+ ubifs_warn("out of space for mmapped file "
+ "(inode number %lu)", inode->i_ino);
+ return err;
+ }
+
+ lock_page(page);
+ if (unlikely(page->mapping != inode->i_mapping ||
+ page_offset(page) > i_size_read(inode))) {
+ /* Page got truncated out from underneath us */
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
+ if (PagePrivate(page))
+ release_new_page_budget(c);
+ else {
+ if (!PageChecked(page))
+ ubifs_convert_page_budget(c);
+ SetPagePrivate(page);
+ atomic_long_inc(&c->dirty_pg_cnt);
+ __set_page_dirty_nobuffers(page);
+ }
+
+ if (update_time) {
+ int release;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ mutex_lock(&ui->ui_mutex);
+ inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
+ release = ui->dirty;
+ mark_inode_dirty_sync(inode);
+ mutex_unlock(&ui->ui_mutex);
+ if (release)
+ ubifs_release_dirty_inode_budget(c, ui);
+ }
+
+ unlock_page(page);
+ return 0;
+
+out_unlock:
+ unlock_page(page);
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+static struct vm_operations_struct ubifs_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ubifs_vm_page_mkwrite,
+};
+
+static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ int err;
+
+ /* 'generic_file_mmap()' takes care of NOMMU case */
+ err = generic_file_mmap(file, vma);
+ if (err)
+ return err;
+ vma->vm_ops = &ubifs_file_vm_ops;
+ return 0;
+}
+
+struct address_space_operations ubifs_file_address_operations = {
+ .readpage = ubifs_readpage,
+ .writepage = ubifs_writepage,
+ .write_begin = ubifs_write_begin,
+ .write_end = ubifs_write_end,
+ .invalidatepage = ubifs_invalidatepage,
+ .set_page_dirty = ubifs_set_page_dirty,
+ .releasepage = ubifs_releasepage,
+};
+
+struct inode_operations ubifs_file_inode_operations = {
+ .setattr = ubifs_setattr,
+ .getattr = ubifs_getattr,
+#ifdef CONFIG_UBIFS_FS_XATTR
+ .setxattr = ubifs_setxattr,
+ .getxattr = ubifs_getxattr,
+ .listxattr = ubifs_listxattr,
+ .removexattr = ubifs_removexattr,
+#endif
+};
+
+struct inode_operations ubifs_symlink_inode_operations = {
+ .readlink = generic_readlink,
+ .follow_link = ubifs_follow_link,
+ .setattr = ubifs_setattr,
+ .getattr = ubifs_getattr,
+};
+
+struct file_operations ubifs_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = generic_file_aio_read,
+ .aio_write = ubifs_aio_write,
+ .mmap = ubifs_file_mmap,
+ .fsync = ubifs_fsync,
+ .unlocked_ioctl = ubifs_ioctl,
+ .splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ubifs_compat_ioctl,
+#endif
+};
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
new file mode 100644
index 00000000000..47814cde240
--- /dev/null
+++ b/fs/ubifs/find.c
@@ -0,0 +1,977 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file contains functions for finding LEBs for various purposes e.g.
+ * garbage collection. In general, lprops category heaps and lists are used
+ * for fast access, falling back on scanning the LPT as a last resort.
+ */
+
+#include <linux/sort.h>
+#include "ubifs.h"
+
+/**
+ * struct scan_data - data provided to scan callback functions
+ * @min_space: minimum number of bytes for which to scan
+ * @pick_free: whether it is OK to scan for empty LEBs
+ * @lnum: LEB number found is returned here
+ * @exclude_index: whether to exclude index LEBs
+ */
+struct scan_data {
+ int min_space;
+ int pick_free;
+ int lnum;
+ int exclude_index;
+};
+
+/**
+ * valuable - determine whether LEB properties are valuable.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties
+ *
+ * This function return %1 if the LEB properties should be added to the LEB
+ * properties tree in memory. Otherwise %0 is returned.
+ */
+static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops)
+{
+ int n, cat = lprops->flags & LPROPS_CAT_MASK;
+ struct ubifs_lpt_heap *heap;
+
+ switch (cat) {
+ case LPROPS_DIRTY:
+ case LPROPS_DIRTY_IDX:
+ case LPROPS_FREE:
+ heap = &c->lpt_heap[cat - 1];
+ if (heap->cnt < heap->max_cnt)
+ return 1;
+ if (lprops->free + lprops->dirty >= c->dark_wm)
+ return 1;
+ return 0;
+ case LPROPS_EMPTY:
+ n = c->lst.empty_lebs + c->freeable_cnt -
+ c->lst.taken_empty_lebs;
+ if (n < c->lsave_cnt)
+ return 1;
+ return 0;
+ case LPROPS_FREEABLE:
+ return 1;
+ case LPROPS_FRDI_IDX:
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * scan_for_dirty_cb - dirty space scan callback.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to scan
+ * @in_tree: whether the LEB properties are in main memory
+ * @data: information passed to and from the caller of the scan
+ *
+ * This function returns a code that indicates whether the scan should continue
+ * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
+ * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
+ * (%LPT_SCAN_STOP).
+ */
+static int scan_for_dirty_cb(struct ubifs_info *c,
+ const struct ubifs_lprops *lprops, int in_tree,
+ struct scan_data *data)
+{
+ int ret = LPT_SCAN_CONTINUE;
+
+ /* Exclude LEBs that are currently in use */
+ if (lprops->flags & LPROPS_TAKEN)
+ return LPT_SCAN_CONTINUE;
+ /* Determine whether to add these LEB properties to the tree */
+ if (!in_tree && valuable(c, lprops))
+ ret |= LPT_SCAN_ADD;
+ /* Exclude LEBs with too little space */
+ if (lprops->free + lprops->dirty < data->min_space)
+ return ret;
+ /* If specified, exclude index LEBs */
+ if (data->exclude_index && lprops->flags & LPROPS_INDEX)
+ return ret;
+ /* If specified, exclude empty or freeable LEBs */
+ if (lprops->free + lprops->dirty == c->leb_size) {
+ if (!data->pick_free)
+ return ret;
+ /* Exclude LEBs with too little dirty space (unless it is empty) */
+ } else if (lprops->dirty < c->dead_wm)
+ return ret;
+ /* Finally we found space */
+ data->lnum = lprops->lnum;
+ return LPT_SCAN_ADD | LPT_SCAN_STOP;
+}
+
+/**
+ * scan_for_dirty - find a data LEB with free space.
+ * @c: the UBIFS file-system description object
+ * @min_space: minimum amount free plus dirty space the returned LEB has to
+ * have
+ * @pick_free: if it is OK to return a free or freeable LEB
+ * @exclude_index: whether to exclude index LEBs
+ *
+ * This function returns a pointer to the LEB properties found or a negative
+ * error code.
+ */
+static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
+ int min_space, int pick_free,
+ int exclude_index)
+{
+ const struct ubifs_lprops *lprops;
+ struct ubifs_lpt_heap *heap;
+ struct scan_data data;
+ int err, i;
+
+ /* There may be an LEB with enough dirty space on the free heap */
+ heap = &c->lpt_heap[LPROPS_FREE - 1];
+ for (i = 0; i < heap->cnt; i++) {
+ lprops = heap->arr[i];
+ if (lprops->free + lprops->dirty < min_space)
+ continue;
+ if (lprops->dirty < c->dead_wm)
+ continue;
+ return lprops;
+ }
+ /*
+ * A LEB may have fallen off of the bottom of the dirty heap, and ended
+ * up as uncategorized even though it has enough dirty space for us now,
+ * so check the uncategorized list. N.B. neither empty nor freeable LEBs
+ * can end up as uncategorized because they are kept on lists not
+ * finite-sized heaps.
+ */
+ list_for_each_entry(lprops, &c->uncat_list, list) {
+ if (lprops->flags & LPROPS_TAKEN)
+ continue;
+ if (lprops->free + lprops->dirty < min_space)
+ continue;
+ if (exclude_index && (lprops->flags & LPROPS_INDEX))
+ continue;
+ if (lprops->dirty < c->dead_wm)
+ continue;
+ return lprops;
+ }
+ /* We have looked everywhere in main memory, now scan the flash */
+ if (c->pnodes_have >= c->pnode_cnt)
+ /* All pnodes are in memory, so skip scan */
+ return ERR_PTR(-ENOSPC);
+ data.min_space = min_space;
+ data.pick_free = pick_free;
+ data.lnum = -1;
+ data.exclude_index = exclude_index;
+ err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+ (ubifs_lpt_scan_callback)scan_for_dirty_cb,
+ &data);
+ if (err)
+ return ERR_PTR(err);
+ ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
+ c->lscan_lnum = data.lnum;
+ lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
+ if (IS_ERR(lprops))
+ return lprops;
+ ubifs_assert(lprops->lnum == data.lnum);
+ ubifs_assert(lprops->free + lprops->dirty >= min_space);
+ ubifs_assert(lprops->dirty >= c->dead_wm ||
+ (pick_free &&
+ lprops->free + lprops->dirty == c->leb_size));
+ ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+ ubifs_assert(!exclude_index || !(lprops->flags & LPROPS_INDEX));
+ return lprops;
+}
+
+/**
+ * ubifs_find_dirty_leb - find a dirty LEB for the Garbage Collector.
+ * @c: the UBIFS file-system description object
+ * @ret_lp: LEB properties are returned here on exit
+ * @min_space: minimum amount free plus dirty space the returned LEB has to
+ * have
+ * @pick_free: controls whether it is OK to pick empty or index LEBs
+ *
+ * This function tries to find a dirty logical eraseblock which has at least
+ * @min_space free and dirty space. It prefers to take an LEB from the dirty or
+ * dirty index heap, and it falls-back to LPT scanning if the heaps are empty
+ * or do not have an LEB which satisfies the @min_space criteria.
+ *
+ * Note, LEBs which have less than dead watermark of free + dirty space are
+ * never picked by this function.
+ *
+ * The additional @pick_free argument controls if this function has to return a
+ * free or freeable LEB if one is present. For example, GC must to set it to %1,
+ * when called from the journal space reservation function, because the
+ * appearance of free space may coincide with the loss of enough dirty space
+ * for GC to succeed anyway.
+ *
+ * In contrast, if the Garbage Collector is called from budgeting, it should
+ * just make free space, not return LEBs which are already free or freeable.
+ *
+ * In addition @pick_free is set to %2 by the recovery process in order to
+ * recover gc_lnum in which case an index LEB must not be returned.
+ *
+ * This function returns zero and the LEB properties of found dirty LEB in case
+ * of success, %-ENOSPC if no dirty LEB was found and a negative error code in
+ * case of other failures. The returned LEB is marked as "taken".
+ */
+int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
+ int min_space, int pick_free)
+{
+ int err = 0, sum, exclude_index = pick_free == 2 ? 1 : 0;
+ const struct ubifs_lprops *lp = NULL, *idx_lp = NULL;
+ struct ubifs_lpt_heap *heap, *idx_heap;
+
+ ubifs_get_lprops(c);
+
+ if (pick_free) {
+ int lebs, rsvd_idx_lebs = 0;
+
+ spin_lock(&c->space_lock);
+ lebs = c->lst.empty_lebs + c->idx_gc_cnt;
+ lebs += c->freeable_cnt - c->lst.taken_empty_lebs;
+
+ /*
+ * Note, the index may consume more LEBs than have been reserved
+ * for it. It is OK because it might be consolidated by GC.
+ * But if the index takes fewer LEBs than it is reserved for it,
+ * this function must avoid picking those reserved LEBs.
+ */
+ if (c->min_idx_lebs >= c->lst.idx_lebs) {
+ rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
+ exclude_index = 1;
+ }
+ spin_unlock(&c->space_lock);
+
+ /* Check if there are enough free LEBs for the index */
+ if (rsvd_idx_lebs < lebs) {
+ /* OK, try to find an empty LEB */
+ lp = ubifs_fast_find_empty(c);
+ if (lp)
+ goto found;
+
+ /* Or a freeable LEB */
+ lp = ubifs_fast_find_freeable(c);
+ if (lp)
+ goto found;
+ } else
+ /*
+ * We cannot pick free/freeable LEBs in the below code.
+ */
+ pick_free = 0;
+ } else {
+ spin_lock(&c->space_lock);
+ exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs);
+ spin_unlock(&c->space_lock);
+ }
+
+ /* Look on the dirty and dirty index heaps */
+ heap = &c->lpt_heap[LPROPS_DIRTY - 1];
+ idx_heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+
+ if (idx_heap->cnt && !exclude_index) {
+ idx_lp = idx_heap->arr[0];
+ sum = idx_lp->free + idx_lp->dirty;
+ /*
+ * Since we reserve thrice as much space for the index than it
+ * actually takes, it does not make sense to pick indexing LEBs
+ * with less than, say, half LEB of dirty space. May be half is
+ * not the optimal boundary - this should be tested and
+ * checked. This boundary should determine how much we use
+ * in-the-gaps to consolidate the index comparing to how much
+ * we use garbage collector to consolidate it. The "half"
+ * criteria just feels to be fine.
+ */
+ if (sum < min_space || sum < c->half_leb_size)
+ idx_lp = NULL;
+ }
+
+ if (heap->cnt) {
+ lp = heap->arr[0];
+ if (lp->dirty + lp->free < min_space)
+ lp = NULL;
+ }
+
+ /* Pick the LEB with most space */
+ if (idx_lp && lp) {
+ if (idx_lp->free + idx_lp->dirty >= lp->free + lp->dirty)
+ lp = idx_lp;
+ } else if (idx_lp && !lp)
+ lp = idx_lp;
+
+ if (lp) {
+ ubifs_assert(lp->free + lp->dirty >= c->dead_wm);
+ goto found;
+ }
+
+ /* Did not find a dirty LEB on the dirty heaps, have to scan */
+ dbg_find("scanning LPT for a dirty LEB");
+ lp = scan_for_dirty(c, min_space, pick_free, exclude_index);
+ if (IS_ERR(lp)) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+ ubifs_assert(lp->dirty >= c->dead_wm ||
+ (pick_free && lp->free + lp->dirty == c->leb_size));
+
+found:
+ dbg_find("found LEB %d, free %d, dirty %d, flags %#x",
+ lp->lnum, lp->free, lp->dirty, lp->flags);
+
+ lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
+ lp->flags | LPROPS_TAKEN, 0);
+ if (IS_ERR(lp)) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+
+ memcpy(ret_lp, lp, sizeof(struct ubifs_lprops));
+
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * scan_for_free_cb - free space scan callback.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to scan
+ * @in_tree: whether the LEB properties are in main memory
+ * @data: information passed to and from the caller of the scan
+ *
+ * This function returns a code that indicates whether the scan should continue
+ * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
+ * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
+ * (%LPT_SCAN_STOP).
+ */
+static int scan_for_free_cb(struct ubifs_info *c,
+ const struct ubifs_lprops *lprops, int in_tree,
+ struct scan_data *data)
+{
+ int ret = LPT_SCAN_CONTINUE;
+
+ /* Exclude LEBs that are currently in use */
+ if (lprops->flags & LPROPS_TAKEN)
+ return LPT_SCAN_CONTINUE;
+ /* Determine whether to add these LEB properties to the tree */
+ if (!in_tree && valuable(c, lprops))
+ ret |= LPT_SCAN_ADD;
+ /* Exclude index LEBs */
+ if (lprops->flags & LPROPS_INDEX)
+ return ret;
+ /* Exclude LEBs with too little space */
+ if (lprops->free < data->min_space)
+ return ret;
+ /* If specified, exclude empty LEBs */
+ if (!data->pick_free && lprops->free == c->leb_size)
+ return ret;
+ /*
+ * LEBs that have only free and dirty space must not be allocated
+ * because they may have been unmapped already or they may have data
+ * that is obsolete only because of nodes that are still sitting in a
+ * wbuf.
+ */
+ if (lprops->free + lprops->dirty == c->leb_size && lprops->dirty > 0)
+ return ret;
+ /* Finally we found space */
+ data->lnum = lprops->lnum;
+ return LPT_SCAN_ADD | LPT_SCAN_STOP;
+}
+
+/**
+ * do_find_free_space - find a data LEB with free space.
+ * @c: the UBIFS file-system description object
+ * @min_space: minimum amount of free space required
+ * @pick_free: whether it is OK to scan for empty LEBs
+ * @squeeze: whether to try to find space in a non-empty LEB first
+ *
+ * This function returns a pointer to the LEB properties found or a negative
+ * error code.
+ */
+static
+const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
+ int min_space, int pick_free,
+ int squeeze)
+{
+ const struct ubifs_lprops *lprops;
+ struct ubifs_lpt_heap *heap;
+ struct scan_data data;
+ int err, i;
+
+ if (squeeze) {
+ lprops = ubifs_fast_find_free(c);
+ if (lprops && lprops->free >= min_space)
+ return lprops;
+ }
+ if (pick_free) {
+ lprops = ubifs_fast_find_empty(c);
+ if (lprops)
+ return lprops;
+ }
+ if (!squeeze) {
+ lprops = ubifs_fast_find_free(c);
+ if (lprops && lprops->free >= min_space)
+ return lprops;
+ }
+ /* There may be an LEB with enough free space on the dirty heap */
+ heap = &c->lpt_heap[LPROPS_DIRTY - 1];
+ for (i = 0; i < heap->cnt; i++) {
+ lprops = heap->arr[i];
+ if (lprops->free >= min_space)
+ return lprops;
+ }
+ /*
+ * A LEB may have fallen off of the bottom of the free heap, and ended
+ * up as uncategorized even though it has enough free space for us now,
+ * so check the uncategorized list. N.B. neither empty nor freeable LEBs
+ * can end up as uncategorized because they are kept on lists not
+ * finite-sized heaps.
+ */
+ list_for_each_entry(lprops, &c->uncat_list, list) {
+ if (lprops->flags & LPROPS_TAKEN)
+ continue;
+ if (lprops->flags & LPROPS_INDEX)
+ continue;
+ if (lprops->free >= min_space)
+ return lprops;
+ }
+ /* We have looked everywhere in main memory, now scan the flash */
+ if (c->pnodes_have >= c->pnode_cnt)
+ /* All pnodes are in memory, so skip scan */
+ return ERR_PTR(-ENOSPC);
+ data.min_space = min_space;
+ data.pick_free = pick_free;
+ data.lnum = -1;
+ err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+ (ubifs_lpt_scan_callback)scan_for_free_cb,
+ &data);
+ if (err)
+ return ERR_PTR(err);
+ ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
+ c->lscan_lnum = data.lnum;
+ lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
+ if (IS_ERR(lprops))
+ return lprops;
+ ubifs_assert(lprops->lnum == data.lnum);
+ ubifs_assert(lprops->free >= min_space);
+ ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+ ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+ return lprops;
+}
+
+/**
+ * ubifs_find_free_space - find a data LEB with free space.
+ * @c: the UBIFS file-system description object
+ * @min_space: minimum amount of required free space
+ * @free: contains amount of free space in the LEB on exit
+ * @squeeze: whether to try to find space in a non-empty LEB first
+ *
+ * This function looks for an LEB with at least @min_space bytes of free space.
+ * It tries to find an empty LEB if possible. If no empty LEBs are available,
+ * this function searches for a non-empty data LEB. The returned LEB is marked
+ * as "taken".
+ *
+ * This function returns found LEB number in case of success, %-ENOSPC if it
+ * failed to find a LEB with @min_space bytes of free space and other a negative
+ * error codes in case of failure.
+ */
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
+ int squeeze)
+{
+ const struct ubifs_lprops *lprops;
+ int lebs, rsvd_idx_lebs, pick_free = 0, err, lnum, flags;
+
+ dbg_find("min_space %d", min_space);
+ ubifs_get_lprops(c);
+
+ /* Check if there are enough empty LEBs for commit */
+ spin_lock(&c->space_lock);
+ if (c->min_idx_lebs > c->lst.idx_lebs)
+ rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
+ else
+ rsvd_idx_lebs = 0;
+ lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
+ c->lst.taken_empty_lebs;
+ if (rsvd_idx_lebs < lebs)
+ /*
+ * OK to allocate an empty LEB, but we still don't want to go
+ * looking for one if there aren't any.
+ */
+ if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+ pick_free = 1;
+ /*
+ * Because we release the space lock, we must account
+ * for this allocation here. After the LEB properties
+ * flags have been updated, we subtract one. Note, the
+ * result of this is that lprops also decreases
+ * @taken_empty_lebs in 'ubifs_change_lp()', so it is
+ * off by one for a short period of time which may
+ * introduce a small disturbance to budgeting
+ * calculations, but this is harmless because at the
+ * worst case this would make the budgeting subsystem
+ * be more pessimistic than needed.
+ *
+ * Fundamentally, this is about serialization of the
+ * budgeting and lprops subsystems. We could make the
+ * @space_lock a mutex and avoid dropping it before
+ * calling 'ubifs_change_lp()', but mutex is more
+ * heavy-weight, and we want budgeting to be as fast as
+ * possible.
+ */
+ c->lst.taken_empty_lebs += 1;
+ }
+ spin_unlock(&c->space_lock);
+
+ lprops = do_find_free_space(c, min_space, pick_free, squeeze);
+ if (IS_ERR(lprops)) {
+ err = PTR_ERR(lprops);
+ goto out;
+ }
+
+ lnum = lprops->lnum;
+ flags = lprops->flags | LPROPS_TAKEN;
+
+ lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, flags, 0);
+ if (IS_ERR(lprops)) {
+ err = PTR_ERR(lprops);
+ goto out;
+ }
+
+ if (pick_free) {
+ spin_lock(&c->space_lock);
+ c->lst.taken_empty_lebs -= 1;
+ spin_unlock(&c->space_lock);
+ }
+
+ *free = lprops->free;
+ ubifs_release_lprops(c);
+
+ if (*free == c->leb_size) {
+ /*
+ * Ensure that empty LEBs have been unmapped. They may not have
+ * been, for example, because of an unclean unmount. Also
+ * LEBs that were freeable LEBs (free + dirty == leb_size) will
+ * not have been unmapped.
+ */
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ }
+
+ dbg_find("found LEB %d, free %d", lnum, *free);
+ ubifs_assert(*free >= min_space);
+ return lnum;
+
+out:
+ if (pick_free) {
+ spin_lock(&c->space_lock);
+ c->lst.taken_empty_lebs -= 1;
+ spin_unlock(&c->space_lock);
+ }
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * scan_for_idx_cb - callback used by the scan for a free LEB for the index.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to scan
+ * @in_tree: whether the LEB properties are in main memory
+ * @data: information passed to and from the caller of the scan
+ *
+ * This function returns a code that indicates whether the scan should continue
+ * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
+ * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
+ * (%LPT_SCAN_STOP).
+ */
+static int scan_for_idx_cb(struct ubifs_info *c,
+ const struct ubifs_lprops *lprops, int in_tree,
+ struct scan_data *data)
+{
+ int ret = LPT_SCAN_CONTINUE;
+
+ /* Exclude LEBs that are currently in use */
+ if (lprops->flags & LPROPS_TAKEN)
+ return LPT_SCAN_CONTINUE;
+ /* Determine whether to add these LEB properties to the tree */
+ if (!in_tree && valuable(c, lprops))
+ ret |= LPT_SCAN_ADD;
+ /* Exclude index LEBS */
+ if (lprops->flags & LPROPS_INDEX)
+ return ret;
+ /* Exclude LEBs that cannot be made empty */
+ if (lprops->free + lprops->dirty != c->leb_size)
+ return ret;
+ /*
+ * We are allocating for the index so it is safe to allocate LEBs with
+ * only free and dirty space, because write buffers are sync'd at commit
+ * start.
+ */
+ data->lnum = lprops->lnum;
+ return LPT_SCAN_ADD | LPT_SCAN_STOP;
+}
+
+/**
+ * scan_for_leb_for_idx - scan for a free LEB for the index.
+ * @c: the UBIFS file-system description object
+ */
+static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c)
+{
+ struct ubifs_lprops *lprops;
+ struct scan_data data;
+ int err;
+
+ data.lnum = -1;
+ err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+ (ubifs_lpt_scan_callback)scan_for_idx_cb,
+ &data);
+ if (err)
+ return ERR_PTR(err);
+ ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
+ c->lscan_lnum = data.lnum;
+ lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
+ if (IS_ERR(lprops))
+ return lprops;
+ ubifs_assert(lprops->lnum == data.lnum);
+ ubifs_assert(lprops->free + lprops->dirty == c->leb_size);
+ ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+ ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+ return lprops;
+}
+
+/**
+ * ubifs_find_free_leb_for_idx - find a free LEB for the index.
+ * @c: the UBIFS file-system description object
+ *
+ * This function looks for a free LEB and returns that LEB number. The returned
+ * LEB is marked as "taken", "index".
+ *
+ * Only empty LEBs are allocated. This is for two reasons. First, the commit
+ * calculates the number of LEBs to allocate based on the assumption that they
+ * will be empty. Secondly, free space at the end of an index LEB is not
+ * guaranteed to be empty because it may have been used by the in-the-gaps
+ * method prior to an unclean unmount.
+ *
+ * If no LEB is found %-ENOSPC is returned. For other failures another negative
+ * error code is returned.
+ */
+int ubifs_find_free_leb_for_idx(struct ubifs_info *c)
+{
+ const struct ubifs_lprops *lprops;
+ int lnum = -1, err, flags;
+
+ ubifs_get_lprops(c);
+
+ lprops = ubifs_fast_find_empty(c);
+ if (!lprops) {
+ lprops = ubifs_fast_find_freeable(c);
+ if (!lprops) {
+ ubifs_assert(c->freeable_cnt == 0);
+ if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+ lprops = scan_for_leb_for_idx(c);
+ if (IS_ERR(lprops)) {
+ err = PTR_ERR(lprops);
+ goto out;
+ }
+ }
+ }
+ }
+
+ if (!lprops) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ lnum = lprops->lnum;
+
+ dbg_find("found LEB %d, free %d, dirty %d, flags %#x",
+ lnum, lprops->free, lprops->dirty, lprops->flags);
+
+ flags = lprops->flags | LPROPS_TAKEN | LPROPS_INDEX;
+ lprops = ubifs_change_lp(c, lprops, c->leb_size, 0, flags, 0);
+ if (IS_ERR(lprops)) {
+ err = PTR_ERR(lprops);
+ goto out;
+ }
+
+ ubifs_release_lprops(c);
+
+ /*
+ * Ensure that empty LEBs have been unmapped. They may not have been,
+ * for example, because of an unclean unmount. Also LEBs that were
+ * freeable LEBs (free + dirty == leb_size) will not have been unmapped.
+ */
+ err = ubifs_leb_unmap(c, lnum);
+ if (err) {
+ ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+ LPROPS_TAKEN | LPROPS_INDEX, 0);
+ return err;
+ }
+
+ return lnum;
+
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+static int cmp_dirty_idx(const struct ubifs_lprops **a,
+ const struct ubifs_lprops **b)
+{
+ const struct ubifs_lprops *lpa = *a;
+ const struct ubifs_lprops *lpb = *b;
+
+ return lpa->dirty + lpa->free - lpb->dirty - lpb->free;
+}
+
+static void swap_dirty_idx(struct ubifs_lprops **a, struct ubifs_lprops **b,
+ int size)
+{
+ struct ubifs_lprops *t = *a;
+
+ *a = *b;
+ *b = t;
+}
+
+/**
+ * ubifs_save_dirty_idx_lnums - save an array of the most dirty index LEB nos.
+ * @c: the UBIFS file-system description object
+ *
+ * This function is called each commit to create an array of LEB numbers of
+ * dirty index LEBs sorted in order of dirty and free space. This is used by
+ * the in-the-gaps method of TNC commit.
+ */
+int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
+{
+ int i;
+
+ ubifs_get_lprops(c);
+ /* Copy the LPROPS_DIRTY_IDX heap */
+ c->dirty_idx.cnt = c->lpt_heap[LPROPS_DIRTY_IDX - 1].cnt;
+ memcpy(c->dirty_idx.arr, c->lpt_heap[LPROPS_DIRTY_IDX - 1].arr,
+ sizeof(void *) * c->dirty_idx.cnt);
+ /* Sort it so that the dirtiest is now at the end */
+ sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *),
+ (int (*)(const void *, const void *))cmp_dirty_idx,
+ (void (*)(void *, void *, int))swap_dirty_idx);
+ dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt);
+ if (c->dirty_idx.cnt)
+ dbg_find("dirtiest index LEB is %d with dirty %d and free %d",
+ c->dirty_idx.arr[c->dirty_idx.cnt - 1]->lnum,
+ c->dirty_idx.arr[c->dirty_idx.cnt - 1]->dirty,
+ c->dirty_idx.arr[c->dirty_idx.cnt - 1]->free);
+ /* Replace the lprops pointers with LEB numbers */
+ for (i = 0; i < c->dirty_idx.cnt; i++)
+ c->dirty_idx.arr[i] = (void *)(size_t)c->dirty_idx.arr[i]->lnum;
+ ubifs_release_lprops(c);
+ return 0;
+}
+
+/**
+ * scan_dirty_idx_cb - callback used by the scan for a dirty index LEB.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to scan
+ * @in_tree: whether the LEB properties are in main memory
+ * @data: information passed to and from the caller of the scan
+ *
+ * This function returns a code that indicates whether the scan should continue
+ * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
+ * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
+ * (%LPT_SCAN_STOP).
+ */
+static int scan_dirty_idx_cb(struct ubifs_info *c,
+ const struct ubifs_lprops *lprops, int in_tree,
+ struct scan_data *data)
+{
+ int ret = LPT_SCAN_CONTINUE;
+
+ /* Exclude LEBs that are currently in use */
+ if (lprops->flags & LPROPS_TAKEN)
+ return LPT_SCAN_CONTINUE;
+ /* Determine whether to add these LEB properties to the tree */
+ if (!in_tree && valuable(c, lprops))
+ ret |= LPT_SCAN_ADD;
+ /* Exclude non-index LEBs */
+ if (!(lprops->flags & LPROPS_INDEX))
+ return ret;
+ /* Exclude LEBs with too little space */
+ if (lprops->free + lprops->dirty < c->min_idx_node_sz)
+ return ret;
+ /* Finally we found space */
+ data->lnum = lprops->lnum;
+ return LPT_SCAN_ADD | LPT_SCAN_STOP;
+}
+
+/**
+ * find_dirty_idx_leb - find a dirty index LEB.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns LEB number upon success and a negative error code upon
+ * failure. In particular, -ENOSPC is returned if a dirty index LEB is not
+ * found.
+ *
+ * Note that this function scans the entire LPT but it is called very rarely.
+ */
+static int find_dirty_idx_leb(struct ubifs_info *c)
+{
+ const struct ubifs_lprops *lprops;
+ struct ubifs_lpt_heap *heap;
+ struct scan_data data;
+ int err, i, ret;
+
+ /* Check all structures in memory first */
+ data.lnum = -1;
+ heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+ for (i = 0; i < heap->cnt; i++) {
+ lprops = heap->arr[i];
+ ret = scan_dirty_idx_cb(c, lprops, 1, &data);
+ if (ret & LPT_SCAN_STOP)
+ goto found;
+ }
+ list_for_each_entry(lprops, &c->frdi_idx_list, list) {
+ ret = scan_dirty_idx_cb(c, lprops, 1, &data);
+ if (ret & LPT_SCAN_STOP)
+ goto found;
+ }
+ list_for_each_entry(lprops, &c->uncat_list, list) {
+ ret = scan_dirty_idx_cb(c, lprops, 1, &data);
+ if (ret & LPT_SCAN_STOP)
+ goto found;
+ }
+ if (c->pnodes_have >= c->pnode_cnt)
+ /* All pnodes are in memory, so skip scan */
+ return -ENOSPC;
+ err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+ (ubifs_lpt_scan_callback)scan_dirty_idx_cb,
+ &data);
+ if (err)
+ return err;
+found:
+ ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
+ c->lscan_lnum = data.lnum;
+ lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
+ if (IS_ERR(lprops))
+ return PTR_ERR(lprops);
+ ubifs_assert(lprops->lnum == data.lnum);
+ ubifs_assert(lprops->free + lprops->dirty >= c->min_idx_node_sz);
+ ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+ ubifs_assert((lprops->flags & LPROPS_INDEX));
+
+ dbg_find("found dirty LEB %d, free %d, dirty %d, flags %#x",
+ lprops->lnum, lprops->free, lprops->dirty, lprops->flags);
+
+ lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC,
+ lprops->flags | LPROPS_TAKEN, 0);
+ if (IS_ERR(lprops))
+ return PTR_ERR(lprops);
+
+ return lprops->lnum;
+}
+
+/**
+ * get_idx_gc_leb - try to get a LEB number from trivial GC.
+ * @c: the UBIFS file-system description object
+ */
+static int get_idx_gc_leb(struct ubifs_info *c)
+{
+ const struct ubifs_lprops *lp;
+ int err, lnum;
+
+ err = ubifs_get_idx_gc_leb(c);
+ if (err < 0)
+ return err;
+ lnum = err;
+ /*
+ * The LEB was due to be unmapped after the commit but
+ * it is needed now for this commit.
+ */
+ lp = ubifs_lpt_lookup_dirty(c, lnum);
+ if (unlikely(IS_ERR(lp)))
+ return PTR_ERR(lp);
+ lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
+ lp->flags | LPROPS_INDEX, -1);
+ if (unlikely(IS_ERR(lp)))
+ return PTR_ERR(lp);
+ dbg_find("LEB %d, dirty %d and free %d flags %#x",
+ lp->lnum, lp->dirty, lp->free, lp->flags);
+ return lnum;
+}
+
+/**
+ * find_dirtiest_idx_leb - find dirtiest index LEB from dirtiest array.
+ * @c: the UBIFS file-system description object
+ */
+static int find_dirtiest_idx_leb(struct ubifs_info *c)
+{
+ const struct ubifs_lprops *lp;
+ int lnum;
+
+ while (1) {
+ if (!c->dirty_idx.cnt)
+ return -ENOSPC;
+ /* The lprops pointers were replaced by LEB numbers */
+ lnum = (size_t)c->dirty_idx.arr[--c->dirty_idx.cnt];
+ lp = ubifs_lpt_lookup(c, lnum);
+ if (IS_ERR(lp))
+ return PTR_ERR(lp);
+ if ((lp->flags & LPROPS_TAKEN) || !(lp->flags & LPROPS_INDEX))
+ continue;
+ lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
+ lp->flags | LPROPS_TAKEN, 0);
+ if (IS_ERR(lp))
+ return PTR_ERR(lp);
+ break;
+ }
+ dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty,
+ lp->free, lp->flags);
+ ubifs_assert(lp->flags | LPROPS_TAKEN);
+ ubifs_assert(lp->flags | LPROPS_INDEX);
+ return lnum;
+}
+
+/**
+ * ubifs_find_dirty_idx_leb - try to find dirtiest index LEB as at last commit.
+ * @c: the UBIFS file-system description object
+ *
+ * This function attempts to find an untaken index LEB with the most free and
+ * dirty space that can be used without overwriting index nodes that were in the
+ * last index committed.
+ */
+int ubifs_find_dirty_idx_leb(struct ubifs_info *c)
+{
+ int err;
+
+ ubifs_get_lprops(c);
+
+ /*
+ * We made an array of the dirtiest index LEB numbers as at the start of
+ * last commit. Try that array first.
+ */
+ err = find_dirtiest_idx_leb(c);
+
+ /* Next try scanning the entire LPT */
+ if (err == -ENOSPC)
+ err = find_dirty_idx_leb(c);
+
+ /* Finally take any index LEBs awaiting trivial GC */
+ if (err == -ENOSPC)
+ err = get_idx_gc_leb(c);
+
+ ubifs_release_lprops(c);
+ return err;
+}
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
new file mode 100644
index 00000000000..02aba36fe3d
--- /dev/null
+++ b/fs/ubifs/gc.c
@@ -0,0 +1,787 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements garbage collection. The procedure for garbage collection
+ * is different depending on whether a LEB as an index LEB (contains index
+ * nodes) or not. For non-index LEBs, garbage collection finds a LEB which
+ * contains a lot of dirty space (obsolete nodes), and copies the non-obsolete
+ * nodes to the journal, at which point the garbage-collected LEB is free to be
+ * reused. For index LEBs, garbage collection marks the non-obsolete index nodes
+ * dirty in the TNC, and after the next commit, the garbage-collected LEB is
+ * to be reused. Garbage collection will cause the number of dirty index nodes
+ * to grow, however sufficient space is reserved for the index to ensure the
+ * commit will never run out of space.
+ */
+
+#include <linux/pagemap.h>
+#include "ubifs.h"
+
+/*
+ * GC tries to optimize the way it fit nodes to available space, and it sorts
+ * nodes a little. The below constants are watermarks which define "large",
+ * "medium", and "small" nodes.
+ */
+#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
+#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
+
+/*
+ * GC may need to move more then one LEB to make progress. The below constants
+ * define "soft" and "hard" limits on the number of LEBs the garbage collector
+ * may move.
+ */
+#define SOFT_LEBS_LIMIT 4
+#define HARD_LEBS_LIMIT 32
+
+/**
+ * switch_gc_head - switch the garbage collection journal head.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to write
+ * @len: length of the buffer to write
+ * @lnum: LEB number written is returned here
+ * @offs: offset written is returned here
+ *
+ * This function switch the GC head to the next LEB which is reserved in
+ * @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
+ * and other negative error code in case of failures.
+ */
+static int switch_gc_head(struct ubifs_info *c)
+{
+ int err, gc_lnum = c->gc_lnum;
+ struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+
+ ubifs_assert(gc_lnum != -1);
+ dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)",
+ wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum,
+ c->leb_size - wbuf->offs - wbuf->used);
+
+ err = ubifs_wbuf_sync_nolock(wbuf);
+ if (err)
+ return err;
+
+ /*
+ * The GC write-buffer was synchronized, we may safely unmap
+ * 'c->gc_lnum'.
+ */
+ err = ubifs_leb_unmap(c, gc_lnum);
+ if (err)
+ return err;
+
+ err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
+ if (err)
+ return err;
+
+ c->gc_lnum = -1;
+ err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM);
+ return err;
+}
+
+/**
+ * move_nodes - move nodes.
+ * @c: UBIFS file-system description object
+ * @sleb: describes nodes to move
+ *
+ * This function moves valid nodes from data LEB described by @sleb to the GC
+ * journal head. The obsolete nodes are dropped.
+ *
+ * When moving nodes we have to deal with classical bin-packing problem: the
+ * space in the current GC journal head LEB and in @c->gc_lnum are the "bins",
+ * where the nodes in the @sleb->nodes list are the elements which should be
+ * fit optimally to the bins. This function uses the "first fit decreasing"
+ * strategy, although it does not really sort the nodes but just split them on
+ * 3 classes - large, medium, and small, so they are roughly sorted.
+ *
+ * This function returns zero in case of success, %-EAGAIN if commit is
+ * required, and other negative error codes in case of other failures.
+ */
+static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+{
+ struct ubifs_scan_node *snod, *tmp;
+ struct list_head large, medium, small;
+ struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+ int avail, err, min = INT_MAX;
+
+ INIT_LIST_HEAD(&large);
+ INIT_LIST_HEAD(&medium);
+ INIT_LIST_HEAD(&small);
+
+ list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+ struct list_head *lst;
+
+ ubifs_assert(snod->type != UBIFS_IDX_NODE);
+ ubifs_assert(snod->type != UBIFS_REF_NODE);
+ ubifs_assert(snod->type != UBIFS_CS_NODE);
+
+ err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
+ snod->offs, 0);
+ if (err < 0)
+ goto out;
+
+ lst = &snod->list;
+ list_del(lst);
+ if (!err) {
+ /* The node is obsolete, remove it from the list */
+ kfree(snod);
+ continue;
+ }
+
+ /*
+ * Sort the list of nodes so that large nodes go first, and
+ * small nodes go last.
+ */
+ if (snod->len > MEDIUM_NODE_WM)
+ list_add(lst, &large);
+ else if (snod->len > SMALL_NODE_WM)
+ list_add(lst, &medium);
+ else
+ list_add(lst, &small);
+
+ /* And find the smallest node */
+ if (snod->len < min)
+ min = snod->len;
+ }
+
+ /*
+ * Join the tree lists so that we'd have one roughly sorted list
+ * ('large' will be the head of the joined list).
+ */
+ list_splice(&medium, large.prev);
+ list_splice(&small, large.prev);
+
+ if (wbuf->lnum == -1) {
+ /*
+ * The GC journal head is not set, because it is the first GC
+ * invocation since mount.
+ */
+ err = switch_gc_head(c);
+ if (err)
+ goto out;
+ }
+
+ /* Write nodes to their new location. Use the first-fit strategy */
+ while (1) {
+ avail = c->leb_size - wbuf->offs - wbuf->used;
+ list_for_each_entry_safe(snod, tmp, &large, list) {
+ int new_lnum, new_offs;
+
+ if (avail < min)
+ break;
+
+ if (snod->len > avail)
+ /* This node does not fit */
+ continue;
+
+ cond_resched();
+
+ new_lnum = wbuf->lnum;
+ new_offs = wbuf->offs + wbuf->used;
+ err = ubifs_wbuf_write_nolock(wbuf, snod->node,
+ snod->len);
+ if (err)
+ goto out;
+ err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
+ snod->offs, new_lnum, new_offs,
+ snod->len);
+ if (err)
+ goto out;
+
+ avail = c->leb_size - wbuf->offs - wbuf->used;
+ list_del(&snod->list);
+ kfree(snod);
+ }
+
+ if (list_empty(&large))
+ break;
+
+ /*
+ * Waste the rest of the space in the LEB and switch to the
+ * next LEB.
+ */
+ err = switch_gc_head(c);
+ if (err)
+ goto out;
+ }
+
+ return 0;
+
+out:
+ list_for_each_entry_safe(snod, tmp, &large, list) {
+ list_del(&snod->list);
+ kfree(snod);
+ }
+ return err;
+}
+
+/**
+ * gc_sync_wbufs - sync write-buffers for GC.
+ * @c: UBIFS file-system description object
+ *
+ * We must guarantee that obsoleting nodes are on flash. Unfortunately they may
+ * be in a write-buffer instead. That is, a node could be written to a
+ * write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is
+ * erased before the write-buffer is sync'd and then there is an unclean
+ * unmount, then an existing node is lost. To avoid this, we sync all
+ * write-buffers.
+ *
+ * This function returns %0 on success or a negative error code on failure.
+ */
+static int gc_sync_wbufs(struct ubifs_info *c)
+{
+ int err, i;
+
+ for (i = 0; i < c->jhead_cnt; i++) {
+ if (i == GCHD)
+ continue;
+ err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+/**
+ * ubifs_garbage_collect_leb - garbage-collect a logical eraseblock.
+ * @c: UBIFS file-system description object
+ * @lp: describes the LEB to garbage collect
+ *
+ * This function garbage-collects an LEB and returns one of the @LEB_FREED,
+ * @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is
+ * required, and other negative error codes in case of failures.
+ */
+int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
+{
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+ int err = 0, lnum = lp->lnum;
+
+ ubifs_assert(c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 ||
+ c->need_recovery);
+ ubifs_assert(c->gc_lnum != lnum);
+ ubifs_assert(wbuf->lnum != lnum);
+
+ /*
+ * We scan the entire LEB even though we only really need to scan up to
+ * (c->leb_size - lp->free).
+ */
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ if (IS_ERR(sleb))
+ return PTR_ERR(sleb);
+
+ ubifs_assert(!list_empty(&sleb->nodes));
+ snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
+
+ if (snod->type == UBIFS_IDX_NODE) {
+ struct ubifs_gced_idx_leb *idx_gc;
+
+ dbg_gc("indexing LEB %d (free %d, dirty %d)",
+ lnum, lp->free, lp->dirty);
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ struct ubifs_idx_node *idx = snod->node;
+ int level = le16_to_cpu(idx->level);
+
+ ubifs_assert(snod->type == UBIFS_IDX_NODE);
+ key_read(c, ubifs_idx_key(c, idx), &snod->key);
+ err = ubifs_dirty_idx_node(c, &snod->key, level, lnum,
+ snod->offs);
+ if (err)
+ goto out;
+ }
+
+ idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
+ if (!idx_gc) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ idx_gc->lnum = lnum;
+ idx_gc->unmap = 0;
+ list_add(&idx_gc->list, &c->idx_gc);
+
+ /*
+ * Don't release the LEB until after the next commit, because
+ * it may contain date which is needed for recovery. So
+ * although we freed this LEB, it will become usable only after
+ * the commit.
+ */
+ err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0,
+ LPROPS_INDEX, 1);
+ if (err)
+ goto out;
+ err = LEB_FREED_IDX;
+ } else {
+ dbg_gc("data LEB %d (free %d, dirty %d)",
+ lnum, lp->free, lp->dirty);
+
+ err = move_nodes(c, sleb);
+ if (err)
+ goto out_inc_seq;
+
+ err = gc_sync_wbufs(c);
+ if (err)
+ goto out_inc_seq;
+
+ err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
+ if (err)
+ goto out_inc_seq;
+
+ /* Allow for races with TNC */
+ c->gced_lnum = lnum;
+ smp_wmb();
+ c->gc_seq += 1;
+ smp_wmb();
+
+ if (c->gc_lnum == -1) {
+ c->gc_lnum = lnum;
+ err = LEB_RETAINED;
+ } else {
+ err = ubifs_wbuf_sync_nolock(wbuf);
+ if (err)
+ goto out;
+
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ goto out;
+
+ err = LEB_FREED;
+ }
+ }
+
+out:
+ ubifs_scan_destroy(sleb);
+ return err;
+
+out_inc_seq:
+ /* We may have moved at least some nodes so allow for races with TNC */
+ c->gced_lnum = lnum;
+ smp_wmb();
+ c->gc_seq += 1;
+ smp_wmb();
+ goto out;
+}
+
+/**
+ * ubifs_garbage_collect - UBIFS garbage collector.
+ * @c: UBIFS file-system description object
+ * @anyway: do GC even if there are free LEBs
+ *
+ * This function does out-of-place garbage collection. The return codes are:
+ * o positive LEB number if the LEB has been freed and may be used;
+ * o %-EAGAIN if the caller has to run commit;
+ * o %-ENOSPC if GC failed to make any progress;
+ * o other negative error codes in case of other errors.
+ *
+ * Garbage collector writes data to the journal when GC'ing data LEBs, and just
+ * marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point
+ * commit may be required. But commit cannot be run from inside GC, because the
+ * caller might be holding the commit lock, so %-EAGAIN is returned instead;
+ * And this error code means that the caller has to run commit, and re-run GC
+ * if there is still no free space.
+ *
+ * There are many reasons why this function may return %-EAGAIN:
+ * o the log is full and there is no space to write an LEB reference for
+ * @c->gc_lnum;
+ * o the journal is too large and exceeds size limitations;
+ * o GC moved indexing LEBs, but they can be used only after the commit;
+ * o the shrinker fails to find clean znodes to free and requests the commit;
+ * o etc.
+ *
+ * Note, if the file-system is close to be full, this function may return
+ * %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of
+ * the function. E.g., this happens if the limits on the journal size are too
+ * tough and GC writes too much to the journal before an LEB is freed. This
+ * might also mean that the journal is too large, and the TNC becomes to big,
+ * so that the shrinker is constantly called, finds not clean znodes to free,
+ * and requests commit. Well, this may also happen if the journal is all right,
+ * but another kernel process consumes too much memory. Anyway, infinite
+ * %-EAGAIN may happen, but in some extreme/misconfiguration cases.
+ */
+int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
+{
+ int i, err, ret, min_space = c->dead_wm;
+ struct ubifs_lprops lp;
+ struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+
+ ubifs_assert_cmt_locked(c);
+
+ if (ubifs_gc_should_commit(c))
+ return -EAGAIN;
+
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+
+ if (c->ro_media) {
+ ret = -EROFS;
+ goto out_unlock;
+ }
+
+ /* We expect the write-buffer to be empty on entry */
+ ubifs_assert(!wbuf->used);
+
+ for (i = 0; ; i++) {
+ int space_before = c->leb_size - wbuf->offs - wbuf->used;
+ int space_after;
+
+ cond_resched();
+
+ /* Give the commit an opportunity to run */
+ if (ubifs_gc_should_commit(c)) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) {
+ /*
+ * We've done enough iterations. Indexing LEBs were
+ * moved and will be available after the commit.
+ */
+ dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN");
+ ubifs_commit_required(c);
+ ret = -EAGAIN;
+ break;
+ }
+
+ if (i > HARD_LEBS_LIMIT) {
+ /*
+ * We've moved too many LEBs and have not made
+ * progress, give up.
+ */
+ dbg_gc("hard limit, -ENOSPC");
+ ret = -ENOSPC;
+ break;
+ }
+
+ /*
+ * Empty and freeable LEBs can turn up while we waited for
+ * the wbuf lock, or while we have been running GC. In that
+ * case, we should just return one of those instead of
+ * continuing to GC dirty LEBs. Hence we request
+ * 'ubifs_find_dirty_leb()' to return an empty LEB if it can.
+ */
+ ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1);
+ if (ret) {
+ if (ret == -ENOSPC)
+ dbg_gc("no more dirty LEBs");
+ break;
+ }
+
+ dbg_gc("found LEB %d: free %d, dirty %d, sum %d "
+ "(min. space %d)", lp.lnum, lp.free, lp.dirty,
+ lp.free + lp.dirty, min_space);
+
+ if (lp.free + lp.dirty == c->leb_size) {
+ /* An empty LEB was returned */
+ dbg_gc("LEB %d is free, return it", lp.lnum);
+ /*
+ * ubifs_find_dirty_leb() doesn't return freeable index
+ * LEBs.
+ */
+ ubifs_assert(!(lp.flags & LPROPS_INDEX));
+ if (lp.free != c->leb_size) {
+ /*
+ * Write buffers must be sync'd before
+ * unmapping freeable LEBs, because one of them
+ * may contain data which obsoletes something
+ * in 'lp.pnum'.
+ */
+ ret = gc_sync_wbufs(c);
+ if (ret)
+ goto out;
+ ret = ubifs_change_one_lp(c, lp.lnum,
+ c->leb_size, 0, 0, 0,
+ 0);
+ if (ret)
+ goto out;
+ }
+ ret = ubifs_leb_unmap(c, lp.lnum);
+ if (ret)
+ goto out;
+ ret = lp.lnum;
+ break;
+ }
+
+ space_before = c->leb_size - wbuf->offs - wbuf->used;
+ if (wbuf->lnum == -1)
+ space_before = 0;
+
+ ret = ubifs_garbage_collect_leb(c, &lp);
+ if (ret < 0) {
+ if (ret == -EAGAIN || ret == -ENOSPC) {
+ /*
+ * These codes are not errors, so we have to
+ * return the LEB to lprops. But if the
+ * 'ubifs_return_leb()' function fails, its
+ * failure code is propagated to the caller
+ * instead of the original '-EAGAIN' or
+ * '-ENOSPC'.
+ */
+ err = ubifs_return_leb(c, lp.lnum);
+ if (err)
+ ret = err;
+ break;
+ }
+ goto out;
+ }
+
+ if (ret == LEB_FREED) {
+ /* An LEB has been freed and is ready for use */
+ dbg_gc("LEB %d freed, return", lp.lnum);
+ ret = lp.lnum;
+ break;
+ }
+
+ if (ret == LEB_FREED_IDX) {
+ /*
+ * This was an indexing LEB and it cannot be
+ * immediately used. And instead of requesting the
+ * commit straight away, we try to garbage collect some
+ * more.
+ */
+ dbg_gc("indexing LEB %d freed, continue", lp.lnum);
+ continue;
+ }
+
+ ubifs_assert(ret == LEB_RETAINED);
+ space_after = c->leb_size - wbuf->offs - wbuf->used;
+ dbg_gc("LEB %d retained, freed %d bytes", lp.lnum,
+ space_after - space_before);
+
+ if (space_after > space_before) {
+ /* GC makes progress, keep working */
+ min_space >>= 1;
+ if (min_space < c->dead_wm)
+ min_space = c->dead_wm;
+ continue;
+ }
+
+ dbg_gc("did not make progress");
+
+ /*
+ * GC moved an LEB bud have not done any progress. This means
+ * that the previous GC head LEB contained too few free space
+ * and the LEB which was GC'ed contained only large nodes which
+ * did not fit that space.
+ *
+ * We can do 2 things:
+ * 1. pick another LEB in a hope it'll contain a small node
+ * which will fit the space we have at the end of current GC
+ * head LEB, but there is no guarantee, so we try this out
+ * unless we have already been working for too long;
+ * 2. request an LEB with more dirty space, which will force
+ * 'ubifs_find_dirty_leb()' to start scanning the lprops
+ * table, instead of just picking one from the heap
+ * (previously it already picked the dirtiest LEB).
+ */
+ if (i < SOFT_LEBS_LIMIT) {
+ dbg_gc("try again");
+ continue;
+ }
+
+ min_space <<= 1;
+ if (min_space > c->dark_wm)
+ min_space = c->dark_wm;
+ dbg_gc("set min. space to %d", min_space);
+ }
+
+ if (ret == -ENOSPC && !list_empty(&c->idx_gc)) {
+ dbg_gc("no space, some index LEBs GC'ed, -EAGAIN");
+ ubifs_commit_required(c);
+ ret = -EAGAIN;
+ }
+
+ err = ubifs_wbuf_sync_nolock(wbuf);
+ if (!err)
+ err = ubifs_leb_unmap(c, c->gc_lnum);
+ if (err) {
+ ret = err;
+ goto out;
+ }
+out_unlock:
+ mutex_unlock(&wbuf->io_mutex);
+ return ret;
+
+out:
+ ubifs_assert(ret < 0);
+ ubifs_assert(ret != -ENOSPC && ret != -EAGAIN);
+ ubifs_ro_mode(c, ret);
+ ubifs_wbuf_sync_nolock(wbuf);
+ mutex_unlock(&wbuf->io_mutex);
+ ubifs_return_leb(c, lp.lnum);
+ return ret;
+}
+
+/**
+ * ubifs_gc_start_commit - garbage collection at start of commit.
+ * @c: UBIFS file-system description object
+ *
+ * If a LEB has only dirty and free space, then we may safely unmap it and make
+ * it free. Note, we cannot do this with indexing LEBs because dirty space may
+ * correspond index nodes that are required for recovery. In that case, the
+ * LEB cannot be unmapped until after the next commit.
+ *
+ * This function returns %0 upon success and a negative error code upon failure.
+ */
+int ubifs_gc_start_commit(struct ubifs_info *c)
+{
+ struct ubifs_gced_idx_leb *idx_gc;
+ const struct ubifs_lprops *lp;
+ int err = 0, flags;
+
+ ubifs_get_lprops(c);
+
+ /*
+ * Unmap (non-index) freeable LEBs. Note that recovery requires that all
+ * wbufs are sync'd before this, which is done in 'do_commit()'.
+ */
+ while (1) {
+ lp = ubifs_fast_find_freeable(c);
+ if (unlikely(IS_ERR(lp))) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+ if (!lp)
+ break;
+ ubifs_assert(!(lp->flags & LPROPS_TAKEN));
+ ubifs_assert(!(lp->flags & LPROPS_INDEX));
+ err = ubifs_leb_unmap(c, lp->lnum);
+ if (err)
+ goto out;
+ lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0);
+ if (unlikely(IS_ERR(lp))) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+ ubifs_assert(!(lp->flags & LPROPS_TAKEN));
+ ubifs_assert(!(lp->flags & LPROPS_INDEX));
+ }
+
+ /* Mark GC'd index LEBs OK to unmap after this commit finishes */
+ list_for_each_entry(idx_gc, &c->idx_gc, list)
+ idx_gc->unmap = 1;
+
+ /* Record index freeable LEBs for unmapping after commit */
+ while (1) {
+ lp = ubifs_fast_find_frdi_idx(c);
+ if (unlikely(IS_ERR(lp))) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+ if (!lp)
+ break;
+ idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
+ if (!idx_gc) {
+ err = -ENOMEM;
+ goto out;
+ }
+ ubifs_assert(!(lp->flags & LPROPS_TAKEN));
+ ubifs_assert(lp->flags & LPROPS_INDEX);
+ /* Don't release the LEB until after the next commit */
+ flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX;
+ lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1);
+ if (unlikely(IS_ERR(lp))) {
+ err = PTR_ERR(lp);
+ kfree(idx_gc);
+ goto out;
+ }
+ ubifs_assert(lp->flags & LPROPS_TAKEN);
+ ubifs_assert(!(lp->flags & LPROPS_INDEX));
+ idx_gc->lnum = lp->lnum;
+ idx_gc->unmap = 1;
+ list_add(&idx_gc->list, &c->idx_gc);
+ }
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * ubifs_gc_end_commit - garbage collection at end of commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function completes out-of-place garbage collection of index LEBs.
+ */
+int ubifs_gc_end_commit(struct ubifs_info *c)
+{
+ struct ubifs_gced_idx_leb *idx_gc, *tmp;
+ struct ubifs_wbuf *wbuf;
+ int err = 0;
+
+ wbuf = &c->jheads[GCHD].wbuf;
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+ list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list)
+ if (idx_gc->unmap) {
+ dbg_gc("LEB %d", idx_gc->lnum);
+ err = ubifs_leb_unmap(c, idx_gc->lnum);
+ if (err)
+ goto out;
+ err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC,
+ LPROPS_NC, 0, LPROPS_TAKEN, -1);
+ if (err)
+ goto out;
+ list_del(&idx_gc->list);
+ kfree(idx_gc);
+ }
+out:
+ mutex_unlock(&wbuf->io_mutex);
+ return err;
+}
+
+/**
+ * ubifs_destroy_idx_gc - destroy idx_gc list.
+ * @c: UBIFS file-system description object
+ *
+ * This function destroys the idx_gc list. It is called when unmounting or
+ * remounting read-only so locks are not needed.
+ */
+void ubifs_destroy_idx_gc(struct ubifs_info *c)
+{
+ while (!list_empty(&c->idx_gc)) {
+ struct ubifs_gced_idx_leb *idx_gc;
+
+ idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb,
+ list);
+ c->idx_gc_cnt -= 1;
+ list_del(&idx_gc->list);
+ kfree(idx_gc);
+ }
+
+}
+
+/**
+ * ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list.
+ * @c: UBIFS file-system description object
+ *
+ * Called during start commit so locks are not needed.
+ */
+int ubifs_get_idx_gc_leb(struct ubifs_info *c)
+{
+ struct ubifs_gced_idx_leb *idx_gc;
+ int lnum;
+
+ if (list_empty(&c->idx_gc))
+ return -ENOSPC;
+ idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list);
+ lnum = idx_gc->lnum;
+ /* c->idx_gc_cnt is updated by the caller when lprops are updated */
+ list_del(&idx_gc->list);
+ kfree(idx_gc);
+ return lnum;
+}
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
new file mode 100644
index 00000000000..054363f2b20
--- /dev/null
+++ b/fs/ubifs/io.c
@@ -0,0 +1,928 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ * Zoltan Sogor
+ */
+
+/*
+ * This file implements UBIFS I/O subsystem which provides various I/O-related
+ * helper functions (reading/writing/checking/validating nodes) and implements
+ * write-buffering support. Write buffers help to save space which otherwise
+ * would have been wasted for padding to the nearest minimal I/O unit boundary.
+ * Instead, data first goes to the write-buffer and is flushed when the
+ * buffer is full or when it is not used for some time (by timer). This is
+ * similarto the mechanism is used by JFFS2.
+ *
+ * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
+ * mutexes defined inside these objects. Since sometimes upper-level code
+ * has to lock the write-buffer (e.g. journal space reservation code), many
+ * functions related to write-buffers have "nolock" suffix which means that the
+ * caller has to lock the write-buffer before calling this function.
+ *
+ * UBIFS stores nodes at 64 bit-aligned addresses. If the node length is not
+ * aligned, UBIFS starts the next node from the aligned address, and the padded
+ * bytes may contain any rubbish. In other words, UBIFS does not put padding
+ * bytes in those small gaps. Common headers of nodes store real node lengths,
+ * not aligned lengths. Indexing nodes also store real lengths in branches.
+ *
+ * UBIFS uses padding when it pads to the next min. I/O unit. In this case it
+ * uses padding nodes or padding bytes, if the padding node does not fit.
+ *
+ * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes
+ * every time they are read from the flash media.
+ */
+
+#include <linux/crc32.h>
+#include "ubifs.h"
+
+/**
+ * ubifs_ro_mode - switch UBIFS to read read-only mode.
+ * @c: UBIFS file-system description object
+ * @err: error code which is the reason of switching to R/O mode
+ */
+void ubifs_ro_mode(struct ubifs_info *c, int err)
+{
+ if (!c->ro_media) {
+ c->ro_media = 1;
+ ubifs_warn("switched to read-only mode, error %d", err);
+ dbg_dump_stack();
+ }
+}
+
+/**
+ * ubifs_check_node - check node.
+ * @c: UBIFS file-system description object
+ * @buf: node to check
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ * @quiet: print no messages
+ *
+ * This function checks node magic number and CRC checksum. This function also
+ * validates node length to prevent UBIFS from becoming crazy when an attacker
+ * feeds it a file-system image with incorrect nodes. For example, too large
+ * node length in the common header could cause UBIFS to read memory outside of
+ * allocated buffer when checking the CRC checksum.
+ *
+ * This function returns zero in case of success %-EUCLEAN in case of bad CRC
+ * or magic.
+ */
+int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
+ int offs, int quiet)
+{
+ int err = -EINVAL, type, node_len;
+ uint32_t crc, node_crc, magic;
+ const struct ubifs_ch *ch = buf;
+
+ ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+ ubifs_assert(!(offs & 7) && offs < c->leb_size);
+
+ magic = le32_to_cpu(ch->magic);
+ if (magic != UBIFS_NODE_MAGIC) {
+ if (!quiet)
+ ubifs_err("bad magic %#08x, expected %#08x",
+ magic, UBIFS_NODE_MAGIC);
+ err = -EUCLEAN;
+ goto out;
+ }
+
+ type = ch->node_type;
+ if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) {
+ if (!quiet)
+ ubifs_err("bad node type %d", type);
+ goto out;
+ }
+
+ node_len = le32_to_cpu(ch->len);
+ if (node_len + offs > c->leb_size)
+ goto out_len;
+
+ if (c->ranges[type].max_len == 0) {
+ if (node_len != c->ranges[type].len)
+ goto out_len;
+ } else if (node_len < c->ranges[type].min_len ||
+ node_len > c->ranges[type].max_len)
+ goto out_len;
+
+ crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
+ node_crc = le32_to_cpu(ch->crc);
+ if (crc != node_crc) {
+ if (!quiet)
+ ubifs_err("bad CRC: calculated %#08x, read %#08x",
+ crc, node_crc);
+ err = -EUCLEAN;
+ goto out;
+ }
+
+ return 0;
+
+out_len:
+ if (!quiet)
+ ubifs_err("bad node length %d", node_len);
+out:
+ if (!quiet) {
+ ubifs_err("bad node at LEB %d:%d", lnum, offs);
+ dbg_dump_node(c, buf);
+ dbg_dump_stack();
+ }
+ return err;
+}
+
+/**
+ * ubifs_pad - pad flash space.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to put padding to
+ * @pad: how many bytes to pad
+ *
+ * The flash media obliges us to write only in chunks of %c->min_io_size and
+ * when we have to write less data we add padding node to the write-buffer and
+ * pad it to the next minimal I/O unit's boundary. Padding nodes help when the
+ * media is being scanned. If the amount of wasted space is not enough to fit a
+ * padding node which takes %UBIFS_PAD_NODE_SZ bytes, we write padding bytes
+ * pattern (%UBIFS_PADDING_BYTE).
+ *
+ * Padding nodes are also used to fill gaps when the "commit-in-gaps" method is
+ * used.
+ */
+void ubifs_pad(const struct ubifs_info *c, void *buf, int pad)
+{
+ uint32_t crc;
+
+ ubifs_assert(pad >= 0 && !(pad & 7));
+
+ if (pad >= UBIFS_PAD_NODE_SZ) {
+ struct ubifs_ch *ch = buf;
+ struct ubifs_pad_node *pad_node = buf;
+
+ ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
+ ch->node_type = UBIFS_PAD_NODE;
+ ch->group_type = UBIFS_NO_NODE_GROUP;
+ ch->padding[0] = ch->padding[1] = 0;
+ ch->sqnum = 0;
+ ch->len = cpu_to_le32(UBIFS_PAD_NODE_SZ);
+ pad -= UBIFS_PAD_NODE_SZ;
+ pad_node->pad_len = cpu_to_le32(pad);
+ crc = crc32(UBIFS_CRC32_INIT, buf + 8, UBIFS_PAD_NODE_SZ - 8);
+ ch->crc = cpu_to_le32(crc);
+ memset(buf + UBIFS_PAD_NODE_SZ, 0, pad);
+ } else if (pad > 0)
+ /* Too little space, padding node won't fit */
+ memset(buf, UBIFS_PADDING_BYTE, pad);
+}
+
+/**
+ * next_sqnum - get next sequence number.
+ * @c: UBIFS file-system description object
+ */
+static unsigned long long next_sqnum(struct ubifs_info *c)
+{
+ unsigned long long sqnum;
+
+ spin_lock(&c->cnt_lock);
+ sqnum = ++c->max_sqnum;
+ spin_unlock(&c->cnt_lock);
+
+ if (unlikely(sqnum >= SQNUM_WARN_WATERMARK)) {
+ if (sqnum >= SQNUM_WATERMARK) {
+ ubifs_err("sequence number overflow %llu, end of life",
+ sqnum);
+ ubifs_ro_mode(c, -EINVAL);
+ }
+ ubifs_warn("running out of sequence numbers, end of life soon");
+ }
+
+ return sqnum;
+}
+
+/**
+ * ubifs_prepare_node - prepare node to be written to flash.
+ * @c: UBIFS file-system description object
+ * @node: the node to pad
+ * @len: node length
+ * @pad: if the buffer has to be padded
+ *
+ * This function prepares node at @node to be written to the media - it
+ * calculates node CRC, fills the common header, and adds proper padding up to
+ * the next minimum I/O unit if @pad is not zero.
+ */
+void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad)
+{
+ uint32_t crc;
+ struct ubifs_ch *ch = node;
+ unsigned long long sqnum = next_sqnum(c);
+
+ ubifs_assert(len >= UBIFS_CH_SZ);
+
+ ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
+ ch->len = cpu_to_le32(len);
+ ch->group_type = UBIFS_NO_NODE_GROUP;
+ ch->sqnum = cpu_to_le64(sqnum);
+ ch->padding[0] = ch->padding[1] = 0;
+ crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
+ ch->crc = cpu_to_le32(crc);
+
+ if (pad) {
+ len = ALIGN(len, 8);
+ pad = ALIGN(len, c->min_io_size) - len;
+ ubifs_pad(c, node + len, pad);
+ }
+}
+
+/**
+ * ubifs_prep_grp_node - prepare node of a group to be written to flash.
+ * @c: UBIFS file-system description object
+ * @node: the node to pad
+ * @len: node length
+ * @last: indicates the last node of the group
+ *
+ * This function prepares node at @node to be written to the media - it
+ * calculates node CRC and fills the common header.
+ */
+void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last)
+{
+ uint32_t crc;
+ struct ubifs_ch *ch = node;
+ unsigned long long sqnum = next_sqnum(c);
+
+ ubifs_assert(len >= UBIFS_CH_SZ);
+
+ ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC);
+ ch->len = cpu_to_le32(len);
+ if (last)
+ ch->group_type = UBIFS_LAST_OF_NODE_GROUP;
+ else
+ ch->group_type = UBIFS_IN_NODE_GROUP;
+ ch->sqnum = cpu_to_le64(sqnum);
+ ch->padding[0] = ch->padding[1] = 0;
+ crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8);
+ ch->crc = cpu_to_le32(crc);
+}
+
+/**
+ * wbuf_timer_callback - write-buffer timer callback function.
+ * @data: timer data (write-buffer descriptor)
+ *
+ * This function is called when the write-buffer timer expires.
+ */
+static void wbuf_timer_callback_nolock(unsigned long data)
+{
+ struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data;
+
+ wbuf->need_sync = 1;
+ wbuf->c->need_wbuf_sync = 1;
+ ubifs_wake_up_bgt(wbuf->c);
+}
+
+/**
+ * new_wbuf_timer - start new write-buffer timer.
+ * @wbuf: write-buffer descriptor
+ */
+static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
+{
+ ubifs_assert(!timer_pending(&wbuf->timer));
+
+ if (!wbuf->timeout)
+ return;
+
+ wbuf->timer.expires = jiffies + wbuf->timeout;
+ add_timer(&wbuf->timer);
+}
+
+/**
+ * cancel_wbuf_timer - cancel write-buffer timer.
+ * @wbuf: write-buffer descriptor
+ */
+static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
+{
+ /*
+ * If the syncer is waiting for the lock (from the background thread's
+ * context) and another task is changing write-buffer then the syncing
+ * should be canceled.
+ */
+ wbuf->need_sync = 0;
+ del_timer(&wbuf->timer);
+}
+
+/**
+ * ubifs_wbuf_sync_nolock - synchronize write-buffer.
+ * @wbuf: write-buffer to synchronize
+ *
+ * This function synchronizes write-buffer @buf and returns zero in case of
+ * success or a negative error code in case of failure.
+ */
+int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
+{
+ struct ubifs_info *c = wbuf->c;
+ int err, dirt;
+
+ cancel_wbuf_timer_nolock(wbuf);
+ if (!wbuf->used || wbuf->lnum == -1)
+ /* Write-buffer is empty or not seeked */
+ return 0;
+
+ dbg_io("LEB %d:%d, %d bytes",
+ wbuf->lnum, wbuf->offs, wbuf->used);
+ ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
+ ubifs_assert(!(wbuf->avail & 7));
+ ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
+
+ if (c->ro_media)
+ return -EROFS;
+
+ ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
+ err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
+ c->min_io_size, wbuf->dtype);
+ if (err) {
+ ubifs_err("cannot write %d bytes to LEB %d:%d",
+ c->min_io_size, wbuf->lnum, wbuf->offs);
+ dbg_dump_stack();
+ return err;
+ }
+
+ dirt = wbuf->avail;
+
+ spin_lock(&wbuf->lock);
+ wbuf->offs += c->min_io_size;
+ wbuf->avail = c->min_io_size;
+ wbuf->used = 0;
+ wbuf->next_ino = 0;
+ spin_unlock(&wbuf->lock);
+
+ if (wbuf->sync_callback)
+ err = wbuf->sync_callback(c, wbuf->lnum,
+ c->leb_size - wbuf->offs, dirt);
+ return err;
+}
+
+/**
+ * ubifs_wbuf_seek_nolock - seek write-buffer.
+ * @wbuf: write-buffer
+ * @lnum: logical eraseblock number to seek to
+ * @offs: logical eraseblock offset to seek to
+ * @dtype: data type
+ *
+ * This function targets the write buffer to logical eraseblock @lnum:@offs.
+ * The write-buffer is synchronized if it is not empty. Returns zero in case of
+ * success and a negative error code in case of failure.
+ */
+int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
+ int dtype)
+{
+ const struct ubifs_info *c = wbuf->c;
+
+ dbg_io("LEB %d:%d", lnum, offs);
+ ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
+ ubifs_assert(offs >= 0 && offs <= c->leb_size);
+ ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
+ ubifs_assert(lnum != wbuf->lnum);
+
+ if (wbuf->used > 0) {
+ int err = ubifs_wbuf_sync_nolock(wbuf);
+
+ if (err)
+ return err;
+ }
+
+ spin_lock(&wbuf->lock);
+ wbuf->lnum = lnum;
+ wbuf->offs = offs;
+ wbuf->avail = c->min_io_size;
+ wbuf->used = 0;
+ spin_unlock(&wbuf->lock);
+ wbuf->dtype = dtype;
+
+ return 0;
+}
+
+/**
+ * ubifs_bg_wbufs_sync - synchronize write-buffers.
+ * @c: UBIFS file-system description object
+ *
+ * This function is called by background thread to synchronize write-buffers.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_bg_wbufs_sync(struct ubifs_info *c)
+{
+ int err, i;
+
+ if (!c->need_wbuf_sync)
+ return 0;
+ c->need_wbuf_sync = 0;
+
+ if (c->ro_media) {
+ err = -EROFS;
+ goto out_timers;
+ }
+
+ dbg_io("synchronize");
+ for (i = 0; i < c->jhead_cnt; i++) {
+ struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
+
+ cond_resched();
+
+ /*
+ * If the mutex is locked then wbuf is being changed, so
+ * synchronization is not necessary.
+ */
+ if (mutex_is_locked(&wbuf->io_mutex))
+ continue;
+
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+ if (!wbuf->need_sync) {
+ mutex_unlock(&wbuf->io_mutex);
+ continue;
+ }
+
+ err = ubifs_wbuf_sync_nolock(wbuf);
+ mutex_unlock(&wbuf->io_mutex);
+ if (err) {
+ ubifs_err("cannot sync write-buffer, error %d", err);
+ ubifs_ro_mode(c, err);
+ goto out_timers;
+ }
+ }
+
+ return 0;
+
+out_timers:
+ /* Cancel all timers to prevent repeated errors */
+ for (i = 0; i < c->jhead_cnt; i++) {
+ struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
+
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+ cancel_wbuf_timer_nolock(wbuf);
+ mutex_unlock(&wbuf->io_mutex);
+ }
+ return err;
+}
+
+/**
+ * ubifs_wbuf_write_nolock - write data to flash via write-buffer.
+ * @wbuf: write-buffer
+ * @buf: node to write
+ * @len: node length
+ *
+ * This function writes data to flash via write-buffer @wbuf. This means that
+ * the last piece of the node won't reach the flash media immediately if it
+ * does not take whole minimal I/O unit. Instead, the node will sit in RAM
+ * until the write-buffer is synchronized (e.g., by timer).
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure. If the node cannot be written because there is no more
+ * space in this logical eraseblock, %-ENOSPC is returned.
+ */
+int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
+{
+ struct ubifs_info *c = wbuf->c;
+ int err, written, n, aligned_len = ALIGN(len, 8), offs;
+
+ dbg_io("%d bytes (%s) to wbuf at LEB %d:%d", len,
+ dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->lnum,
+ wbuf->offs + wbuf->used);
+ ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
+ ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
+ ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
+ ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
+ ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
+
+ if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ cancel_wbuf_timer_nolock(wbuf);
+
+ if (c->ro_media)
+ return -EROFS;
+
+ if (aligned_len <= wbuf->avail) {
+ /*
+ * The node is not very large and fits entirely within
+ * write-buffer.
+ */
+ memcpy(wbuf->buf + wbuf->used, buf, len);
+
+ if (aligned_len == wbuf->avail) {
+ dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum,
+ wbuf->offs);
+ err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
+ wbuf->offs, c->min_io_size,
+ wbuf->dtype);
+ if (err)
+ goto out;
+
+ spin_lock(&wbuf->lock);
+ wbuf->offs += c->min_io_size;
+ wbuf->avail = c->min_io_size;
+ wbuf->used = 0;
+ wbuf->next_ino = 0;
+ spin_unlock(&wbuf->lock);
+ } else {
+ spin_lock(&wbuf->lock);
+ wbuf->avail -= aligned_len;
+ wbuf->used += aligned_len;
+ spin_unlock(&wbuf->lock);
+ }
+
+ goto exit;
+ }
+
+ /*
+ * The node is large enough and does not fit entirely within current
+ * minimal I/O unit. We have to fill and flush write-buffer and switch
+ * to the next min. I/O unit.
+ */
+ dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, wbuf->offs);
+ memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
+ err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
+ c->min_io_size, wbuf->dtype);
+ if (err)
+ goto out;
+
+ offs = wbuf->offs + c->min_io_size;
+ len -= wbuf->avail;
+ aligned_len -= wbuf->avail;
+ written = wbuf->avail;
+
+ /*
+ * The remaining data may take more whole min. I/O units, so write the
+ * remains multiple to min. I/O unit size directly to the flash media.
+ * We align node length to 8-byte boundary because we anyway flash wbuf
+ * if the remaining space is less than 8 bytes.
+ */
+ n = aligned_len >> c->min_io_shift;
+ if (n) {
+ n <<= c->min_io_shift;
+ dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
+ err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
+ wbuf->dtype);
+ if (err)
+ goto out;
+ offs += n;
+ aligned_len -= n;
+ len -= n;
+ written += n;
+ }
+
+ spin_lock(&wbuf->lock);
+ if (aligned_len)
+ /*
+ * And now we have what's left and what does not take whole
+ * min. I/O unit, so write it to the write-buffer and we are
+ * done.
+ */
+ memcpy(wbuf->buf, buf + written, len);
+
+ wbuf->offs = offs;
+ wbuf->used = aligned_len;
+ wbuf->avail = c->min_io_size - aligned_len;
+ wbuf->next_ino = 0;
+ spin_unlock(&wbuf->lock);
+
+exit:
+ if (wbuf->sync_callback) {
+ int free = c->leb_size - wbuf->offs - wbuf->used;
+
+ err = wbuf->sync_callback(c, wbuf->lnum, free, 0);
+ if (err)
+ goto out;
+ }
+
+ if (wbuf->used)
+ new_wbuf_timer_nolock(wbuf);
+
+ return 0;
+
+out:
+ ubifs_err("cannot write %d bytes to LEB %d:%d, error %d",
+ len, wbuf->lnum, wbuf->offs, err);
+ dbg_dump_node(c, buf);
+ dbg_dump_stack();
+ dbg_dump_leb(c, wbuf->lnum);
+ return err;
+}
+
+/**
+ * ubifs_write_node - write node to the media.
+ * @c: UBIFS file-system description object
+ * @buf: the node to write
+ * @len: node length
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN)
+ *
+ * This function automatically fills node magic number, assigns sequence
+ * number, and calculates node CRC checksum. The length of the @buf buffer has
+ * to be aligned to the minimal I/O unit size. This function automatically
+ * appends padding node and padding bytes if needed. Returns zero in case of
+ * success and a negative error code in case of failure.
+ */
+int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
+ int offs, int dtype)
+{
+ int err, buf_len = ALIGN(len, c->min_io_size);
+
+ dbg_io("LEB %d:%d, %s, length %d (aligned %d)",
+ lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len,
+ buf_len);
+ ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+ ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size);
+
+ if (c->ro_media)
+ return -EROFS;
+
+ ubifs_prepare_node(c, buf, len, 1);
+ err = ubi_leb_write(c->ubi, lnum, buf, offs, buf_len, dtype);
+ if (err) {
+ ubifs_err("cannot write %d bytes to LEB %d:%d, error %d",
+ buf_len, lnum, offs, err);
+ dbg_dump_node(c, buf);
+ dbg_dump_stack();
+ }
+
+ return err;
+}
+
+/**
+ * ubifs_read_node_wbuf - read node from the media or write-buffer.
+ * @wbuf: wbuf to check for un-written data
+ * @buf: buffer to read to
+ * @type: node type
+ * @len: node length
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ *
+ * This function reads a node of known type and length, checks it and stores
+ * in @buf. If the node partially or fully sits in the write-buffer, this
+ * function takes data from the buffer, otherwise it reads the flash media.
+ * Returns zero in case of success, %-EUCLEAN if CRC mismatched and a negative
+ * error code in case of failure.
+ */
+int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
+ int lnum, int offs)
+{
+ const struct ubifs_info *c = wbuf->c;
+ int err, rlen, overlap;
+ struct ubifs_ch *ch = buf;
+
+ dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
+ ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+ ubifs_assert(!(offs & 7) && offs < c->leb_size);
+ ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
+
+ spin_lock(&wbuf->lock);
+ overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs);
+ if (!overlap) {
+ /* We may safely unlock the write-buffer and read the data */
+ spin_unlock(&wbuf->lock);
+ return ubifs_read_node(c, buf, type, len, lnum, offs);
+ }
+
+ /* Don't read under wbuf */
+ rlen = wbuf->offs - offs;
+ if (rlen < 0)
+ rlen = 0;
+
+ /* Copy the rest from the write-buffer */
+ memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen);
+ spin_unlock(&wbuf->lock);
+
+ if (rlen > 0) {
+ /* Read everything that goes before write-buffer */
+ err = ubi_read(c->ubi, lnum, buf, offs, rlen);
+ if (err && err != -EBADMSG) {
+ ubifs_err("failed to read node %d from LEB %d:%d, "
+ "error %d", type, lnum, offs, err);
+ dbg_dump_stack();
+ return err;
+ }
+ }
+
+ if (type != ch->node_type) {
+ ubifs_err("bad node type (%d but expected %d)",
+ ch->node_type, type);
+ goto out;
+ }
+
+ err = ubifs_check_node(c, buf, lnum, offs, 0);
+ if (err) {
+ ubifs_err("expected node type %d", type);
+ return err;
+ }
+
+ rlen = le32_to_cpu(ch->len);
+ if (rlen != len) {
+ ubifs_err("bad node length %d, expected %d", rlen, len);
+ goto out;
+ }
+
+ return 0;
+
+out:
+ ubifs_err("bad node at LEB %d:%d", lnum, offs);
+ dbg_dump_node(c, buf);
+ dbg_dump_stack();
+ return -EINVAL;
+}
+
+/**
+ * ubifs_read_node - read node.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to read to
+ * @type: node type
+ * @len: node length (not aligned)
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ *
+ * This function reads a node of known type and and length, checks it and
+ * stores in @buf. Returns zero in case of success, %-EUCLEAN if CRC mismatched
+ * and a negative error code in case of failure.
+ */
+int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
+ int lnum, int offs)
+{
+ int err, l;
+ struct ubifs_ch *ch = buf;
+
+ dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
+ ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0);
+ ubifs_assert(len >= UBIFS_CH_SZ && offs + len <= c->leb_size);
+ ubifs_assert(!(offs & 7) && offs < c->leb_size);
+ ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT);
+
+ err = ubi_read(c->ubi, lnum, buf, offs, len);
+ if (err && err != -EBADMSG) {
+ ubifs_err("cannot read node %d from LEB %d:%d, error %d",
+ type, lnum, offs, err);
+ return err;
+ }
+
+ if (type != ch->node_type) {
+ ubifs_err("bad node type (%d but expected %d)",
+ ch->node_type, type);
+ goto out;
+ }
+
+ err = ubifs_check_node(c, buf, lnum, offs, 0);
+ if (err) {
+ ubifs_err("expected node type %d", type);
+ return err;
+ }
+
+ l = le32_to_cpu(ch->len);
+ if (l != len) {
+ ubifs_err("bad node length %d, expected %d", l, len);
+ goto out;
+ }
+
+ return 0;
+
+out:
+ ubifs_err("bad node at LEB %d:%d", lnum, offs);
+ dbg_dump_node(c, buf);
+ dbg_dump_stack();
+ return -EINVAL;
+}
+
+/**
+ * ubifs_wbuf_init - initialize write-buffer.
+ * @c: UBIFS file-system description object
+ * @wbuf: write-buffer to initialize
+ *
+ * This function initializes write buffer. Returns zero in case of success
+ * %-ENOMEM in case of failure.
+ */
+int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
+{
+ size_t size;
+
+ wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL);
+ if (!wbuf->buf)
+ return -ENOMEM;
+
+ size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t);
+ wbuf->inodes = kmalloc(size, GFP_KERNEL);
+ if (!wbuf->inodes) {
+ kfree(wbuf->buf);
+ wbuf->buf = NULL;
+ return -ENOMEM;
+ }
+
+ wbuf->used = 0;
+ wbuf->lnum = wbuf->offs = -1;
+ wbuf->avail = c->min_io_size;
+ wbuf->dtype = UBI_UNKNOWN;
+ wbuf->sync_callback = NULL;
+ mutex_init(&wbuf->io_mutex);
+ spin_lock_init(&wbuf->lock);
+
+ wbuf->c = c;
+ init_timer(&wbuf->timer);
+ wbuf->timer.function = wbuf_timer_callback_nolock;
+ wbuf->timer.data = (unsigned long)wbuf;
+ wbuf->timeout = DEFAULT_WBUF_TIMEOUT;
+ wbuf->next_ino = 0;
+
+ return 0;
+}
+
+/**
+ * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array.
+ * @wbuf: the write-buffer whereto add
+ * @inum: the inode number
+ *
+ * This function adds an inode number to the inode array of the write-buffer.
+ */
+void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum)
+{
+ if (!wbuf->buf)
+ /* NOR flash or something similar */
+ return;
+
+ spin_lock(&wbuf->lock);
+ if (wbuf->used)
+ wbuf->inodes[wbuf->next_ino++] = inum;
+ spin_unlock(&wbuf->lock);
+}
+
+/**
+ * wbuf_has_ino - returns if the wbuf contains data from the inode.
+ * @wbuf: the write-buffer
+ * @inum: the inode number
+ *
+ * This function returns with %1 if the write-buffer contains some data from the
+ * given inode otherwise it returns with %0.
+ */
+static int wbuf_has_ino(struct ubifs_wbuf *wbuf, ino_t inum)
+{
+ int i, ret = 0;
+
+ spin_lock(&wbuf->lock);
+ for (i = 0; i < wbuf->next_ino; i++)
+ if (inum == wbuf->inodes[i]) {
+ ret = 1;
+ break;
+ }
+ spin_unlock(&wbuf->lock);
+
+ return ret;
+}
+
+/**
+ * ubifs_sync_wbufs_by_inode - synchronize write-buffers for an inode.
+ * @c: UBIFS file-system description object
+ * @inode: inode to synchronize
+ *
+ * This function synchronizes write-buffers which contain nodes belonging to
+ * @inode. Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode)
+{
+ int i, err = 0;
+
+ for (i = 0; i < c->jhead_cnt; i++) {
+ struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
+
+ if (i == GCHD)
+ /*
+ * GC head is special, do not look at it. Even if the
+ * head contains something related to this inode, it is
+ * a _copy_ of corresponding on-flash node which sits
+ * somewhere else.
+ */
+ continue;
+
+ if (!wbuf_has_ino(wbuf, inode->i_ino))
+ continue;
+
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+ if (wbuf_has_ino(wbuf, inode->i_ino))
+ err = ubifs_wbuf_sync_nolock(wbuf);
+ mutex_unlock(&wbuf->io_mutex);
+
+ if (err) {
+ ubifs_ro_mode(c, err);
+ return err;
+ }
+ }
+ return 0;
+}
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
new file mode 100644
index 00000000000..5e82cffe969
--- /dev/null
+++ b/fs/ubifs/ioctl.c
@@ -0,0 +1,204 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Zoltan Sogor
+ * Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/* This file implements EXT2-compatible extended attribute ioctl() calls */
+
+#include <linux/compat.h>
+#include <linux/smp_lock.h>
+#include <linux/mount.h>
+#include "ubifs.h"
+
+/**
+ * ubifs_set_inode_flags - set VFS inode flags.
+ * @inode: VFS inode to set flags for
+ *
+ * This function propagates flags from UBIFS inode object to VFS inode object.
+ */
+void ubifs_set_inode_flags(struct inode *inode)
+{
+ unsigned int flags = ubifs_inode(inode)->flags;
+
+ inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC);
+ if (flags & UBIFS_SYNC_FL)
+ inode->i_flags |= S_SYNC;
+ if (flags & UBIFS_APPEND_FL)
+ inode->i_flags |= S_APPEND;
+ if (flags & UBIFS_IMMUTABLE_FL)
+ inode->i_flags |= S_IMMUTABLE;
+ if (flags & UBIFS_DIRSYNC_FL)
+ inode->i_flags |= S_DIRSYNC;
+}
+
+/*
+ * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags.
+ * @ioctl_flags: flags to convert
+ *
+ * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags
+ * (@UBIFS_COMPR_FL, etc).
+ */
+static int ioctl2ubifs(int ioctl_flags)
+{
+ int ubifs_flags = 0;
+
+ if (ioctl_flags & FS_COMPR_FL)
+ ubifs_flags |= UBIFS_COMPR_FL;
+ if (ioctl_flags & FS_SYNC_FL)
+ ubifs_flags |= UBIFS_SYNC_FL;
+ if (ioctl_flags & FS_APPEND_FL)
+ ubifs_flags |= UBIFS_APPEND_FL;
+ if (ioctl_flags & FS_IMMUTABLE_FL)
+ ubifs_flags |= UBIFS_IMMUTABLE_FL;
+ if (ioctl_flags & FS_DIRSYNC_FL)
+ ubifs_flags |= UBIFS_DIRSYNC_FL;
+
+ return ubifs_flags;
+}
+
+/*
+ * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags.
+ * @ubifs_flags: flags to convert
+ *
+ * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags
+ * (@FS_COMPR_FL, etc).
+ */
+static int ubifs2ioctl(int ubifs_flags)
+{
+ int ioctl_flags = 0;
+
+ if (ubifs_flags & UBIFS_COMPR_FL)
+ ioctl_flags |= FS_COMPR_FL;
+ if (ubifs_flags & UBIFS_SYNC_FL)
+ ioctl_flags |= FS_SYNC_FL;
+ if (ubifs_flags & UBIFS_APPEND_FL)
+ ioctl_flags |= FS_APPEND_FL;
+ if (ubifs_flags & UBIFS_IMMUTABLE_FL)
+ ioctl_flags |= FS_IMMUTABLE_FL;
+ if (ubifs_flags & UBIFS_DIRSYNC_FL)
+ ioctl_flags |= FS_DIRSYNC_FL;
+
+ return ioctl_flags;
+}
+
+static int setflags(struct inode *inode, int flags)
+{
+ int oldflags, err, release;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ struct ubifs_budget_req req = { .dirtied_ino = 1,
+ .dirtied_ino_d = ui->data_len };
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ /*
+ * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+ * the relevant capability.
+ */
+ mutex_lock(&ui->ui_mutex);
+ oldflags = ubifs2ioctl(ui->flags);
+ if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ err = -EPERM;
+ goto out_unlock;
+ }
+ }
+
+ ui->flags = ioctl2ubifs(flags);
+ ubifs_set_inode_flags(inode);
+ inode->i_ctime = ubifs_current_time(inode);
+ release = ui->dirty;
+ mark_inode_dirty_sync(inode);
+ mutex_unlock(&ui->ui_mutex);
+
+ if (release)
+ ubifs_release_budget(c, &req);
+ if (IS_SYNC(inode))
+ err = write_inode_now(inode, 1);
+ return err;
+
+out_unlock:
+ ubifs_err("can't modify inode %lu attributes", inode->i_ino);
+ mutex_unlock(&ui->ui_mutex);
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ int flags, err;
+ struct inode *inode = file->f_path.dentry->d_inode;
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ flags = ubifs2ioctl(ubifs_inode(inode)->flags);
+
+ return put_user(flags, (int __user *) arg);
+
+ case FS_IOC_SETFLAGS: {
+ if (IS_RDONLY(inode))
+ return -EROFS;
+
+ if (!is_owner_or_cap(inode))
+ return -EACCES;
+
+ if (get_user(flags, (int __user *) arg))
+ return -EFAULT;
+
+ if (!S_ISDIR(inode->i_mode))
+ flags &= ~FS_DIRSYNC_FL;
+
+ /*
+ * Make sure the file-system is read-write and make sure it
+ * will not become read-only while we are changing the flags.
+ */
+ err = mnt_want_write(file->f_path.mnt);
+ if (err)
+ return err;
+ err = setflags(inode, flags);
+ mnt_drop_write(file->f_path.mnt);
+ return err;
+ }
+
+ default:
+ return -ENOTTY;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+ return ubifs_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
new file mode 100644
index 00000000000..22993f867d1
--- /dev/null
+++ b/fs/ubifs/journal.c
@@ -0,0 +1,1441 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS journal.
+ *
+ * The journal consists of 2 parts - the log and bud LEBs. The log has fixed
+ * length and position, while a bud logical eraseblock is any LEB in the main
+ * area. Buds contain file system data - data nodes, inode nodes, etc. The log
+ * contains only references to buds and some other stuff like commit
+ * start node. The idea is that when we commit the journal, we do
+ * not copy the data, the buds just become indexed. Since after the commit the
+ * nodes in bud eraseblocks become leaf nodes of the file system index tree, we
+ * use term "bud". Analogy is obvious, bud eraseblocks contain nodes which will
+ * become leafs in the future.
+ *
+ * The journal is multi-headed because we want to write data to the journal as
+ * optimally as possible. It is nice to have nodes belonging to the same inode
+ * in one LEB, so we may write data owned by different inodes to different
+ * journal heads, although at present only one data head is used.
+ *
+ * For recovery reasons, the base head contains all inode nodes, all directory
+ * entry nodes and all truncate nodes. This means that the other heads contain
+ * only data nodes.
+ *
+ * Bud LEBs may be half-indexed. For example, if the bud was not full at the
+ * time of commit, the bud is retained to continue to be used in the journal,
+ * even though the "front" of the LEB is now indexed. In that case, the log
+ * reference contains the offset where the bud starts for the purposes of the
+ * journal.
+ *
+ * The journal size has to be limited, because the larger is the journal, the
+ * longer it takes to mount UBIFS (scanning the journal) and the more memory it
+ * takes (indexing in the TNC).
+ *
+ * All the journal write operations like 'ubifs_jnl_update()' here, which write
+ * multiple UBIFS nodes to the journal at one go, are atomic with respect to
+ * unclean reboots. Should the unclean reboot happen, the recovery code drops
+ * all the nodes.
+ */
+
+#include "ubifs.h"
+
+/**
+ * zero_ino_node_unused - zero out unused fields of an on-flash inode node.
+ * @ino: the inode to zero out
+ */
+static inline void zero_ino_node_unused(struct ubifs_ino_node *ino)
+{
+ memset(ino->padding1, 0, 4);
+ memset(ino->padding2, 0, 26);
+}
+
+/**
+ * zero_dent_node_unused - zero out unused fields of an on-flash directory
+ * entry node.
+ * @dent: the directory entry to zero out
+ */
+static inline void zero_dent_node_unused(struct ubifs_dent_node *dent)
+{
+ dent->padding1 = 0;
+ memset(dent->padding2, 0, 4);
+}
+
+/**
+ * zero_data_node_unused - zero out unused fields of an on-flash data node.
+ * @data: the data node to zero out
+ */
+static inline void zero_data_node_unused(struct ubifs_data_node *data)
+{
+ memset(data->padding, 0, 2);
+}
+
+/**
+ * zero_trun_node_unused - zero out unused fields of an on-flash truncation
+ * node.
+ * @trun: the truncation node to zero out
+ */
+static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
+{
+ memset(trun->padding, 0, 12);
+}
+
+/**
+ * reserve_space - reserve space in the journal.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head number
+ * @len: node length
+ *
+ * This function reserves space in journal head @head. If the reservation
+ * succeeded, the journal head stays locked and later has to be unlocked using
+ * 'release_head()'. 'write_node()' and 'write_head()' functions also unlock
+ * it. Returns zero in case of success, %-EAGAIN if commit has to be done, and
+ * other negative error codes in case of other failures.
+ */
+static int reserve_space(struct ubifs_info *c, int jhead, int len)
+{
+ int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze;
+ struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
+
+ /*
+ * Typically, the base head has smaller nodes written to it, so it is
+ * better to try to allocate space at the ends of eraseblocks. This is
+ * what the squeeze parameter does.
+ */
+ squeeze = (jhead == BASEHD);
+again:
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+
+ if (c->ro_media) {
+ err = -EROFS;
+ goto out_unlock;
+ }
+
+ avail = c->leb_size - wbuf->offs - wbuf->used;
+ if (wbuf->lnum != -1 && avail >= len)
+ return 0;
+
+ /*
+ * Write buffer wasn't seek'ed or there is no enough space - look for an
+ * LEB with some empty space.
+ */
+ lnum = ubifs_find_free_space(c, len, &free, squeeze);
+ if (lnum >= 0) {
+ /* Found an LEB, add it to the journal head */
+ offs = c->leb_size - free;
+ err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
+ if (err)
+ goto out_return;
+ /* A new bud was successfully allocated and added to the log */
+ goto out;
+ }
+
+ err = lnum;
+ if (err != -ENOSPC)
+ goto out_unlock;
+
+ /*
+ * No free space, we have to run garbage collector to make
+ * some. But the write-buffer mutex has to be unlocked because
+ * GC also takes it.
+ */
+ dbg_jnl("no free space jhead %d, run GC", jhead);
+ mutex_unlock(&wbuf->io_mutex);
+
+ lnum = ubifs_garbage_collect(c, 0);
+ if (lnum < 0) {
+ err = lnum;
+ if (err != -ENOSPC)
+ return err;
+
+ /*
+ * GC could not make a free LEB. But someone else may
+ * have allocated new bud for this journal head,
+ * because we dropped @wbuf->io_mutex, so try once
+ * again.
+ */
+ dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead);
+ if (retries++ < 2) {
+ dbg_jnl("retry (%d)", retries);
+ goto again;
+ }
+
+ dbg_jnl("return -ENOSPC");
+ return err;
+ }
+
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+ dbg_jnl("got LEB %d for jhead %d", lnum, jhead);
+ avail = c->leb_size - wbuf->offs - wbuf->used;
+
+ if (wbuf->lnum != -1 && avail >= len) {
+ /*
+ * Someone else has switched the journal head and we have
+ * enough space now. This happens when more then one process is
+ * trying to write to the same journal head at the same time.
+ */
+ dbg_jnl("return LEB %d back, already have LEB %d:%d",
+ lnum, wbuf->lnum, wbuf->offs + wbuf->used);
+ err = ubifs_return_leb(c, lnum);
+ if (err)
+ goto out_unlock;
+ return 0;
+ }
+
+ err = ubifs_add_bud_to_log(c, jhead, lnum, 0);
+ if (err)
+ goto out_return;
+ offs = 0;
+
+out:
+ err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM);
+ if (err)
+ goto out_unlock;
+
+ return 0;
+
+out_unlock:
+ mutex_unlock(&wbuf->io_mutex);
+ return err;
+
+out_return:
+ /* An error occurred and the LEB has to be returned to lprops */
+ ubifs_assert(err < 0);
+ err1 = ubifs_return_leb(c, lnum);
+ if (err1 && err == -EAGAIN)
+ /*
+ * Return original error code only if it is not %-EAGAIN,
+ * which is not really an error. Otherwise, return the error
+ * code of 'ubifs_return_leb()'.
+ */
+ err = err1;
+ mutex_unlock(&wbuf->io_mutex);
+ return err;
+}
+
+/**
+ * write_node - write node to a journal head.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head
+ * @node: node to write
+ * @len: node length
+ * @lnum: LEB number written is returned here
+ * @offs: offset written is returned here
+ *
+ * This function writes a node to reserved space of journal head @jhead.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static int write_node(struct ubifs_info *c, int jhead, void *node, int len,
+ int *lnum, int *offs)
+{
+ struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
+
+ ubifs_assert(jhead != GCHD);
+
+ *lnum = c->jheads[jhead].wbuf.lnum;
+ *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
+
+ dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
+ ubifs_prepare_node(c, node, len, 0);
+
+ return ubifs_wbuf_write_nolock(wbuf, node, len);
+}
+
+/**
+ * write_head - write data to a journal head.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head
+ * @buf: buffer to write
+ * @len: length to write
+ * @lnum: LEB number written is returned here
+ * @offs: offset written is returned here
+ * @sync: non-zero if the write-buffer has to by synchronized
+ *
+ * This function is the same as 'write_node()' but it does not assume the
+ * buffer it is writing is a node, so it does not prepare it (which means
+ * initializing common header and calculating CRC).
+ */
+static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
+ int *lnum, int *offs, int sync)
+{
+ int err;
+ struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
+
+ ubifs_assert(jhead != GCHD);
+
+ *lnum = c->jheads[jhead].wbuf.lnum;
+ *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used;
+ dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len);
+
+ err = ubifs_wbuf_write_nolock(wbuf, buf, len);
+ if (err)
+ return err;
+ if (sync)
+ err = ubifs_wbuf_sync_nolock(wbuf);
+ return err;
+}
+
+/**
+ * make_reservation - reserve journal space.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head
+ * @len: how many bytes to reserve
+ *
+ * This function makes space reservation in journal head @jhead. The function
+ * takes the commit lock and locks the journal head, and the caller has to
+ * unlock the head and finish the reservation with 'finish_reservation()'.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ *
+ * Note, the journal head may be unlocked as soon as the data is written, while
+ * the commit lock has to be released after the data has been added to the
+ * TNC.
+ */
+static int make_reservation(struct ubifs_info *c, int jhead, int len)
+{
+ int err, cmt_retries = 0, nospc_retries = 0;
+
+again:
+ down_read(&c->commit_sem);
+ err = reserve_space(c, jhead, len);
+ if (!err)
+ return 0;
+ up_read(&c->commit_sem);
+
+ if (err == -ENOSPC) {
+ /*
+ * GC could not make any progress. We should try to commit
+ * once because it could make some dirty space and GC would
+ * make progress, so make the error -EAGAIN so that the below
+ * will commit and re-try.
+ */
+ if (nospc_retries++ < 2) {
+ dbg_jnl("no space, retry");
+ err = -EAGAIN;
+ }
+
+ /*
+ * This means that the budgeting is incorrect. We always have
+ * to be able to write to the media, because all operations are
+ * budgeted. Deletions are not budgeted, though, but we reserve
+ * an extra LEB for them.
+ */
+ }
+
+ if (err != -EAGAIN)
+ goto out;
+
+ /*
+ * -EAGAIN means that the journal is full or too large, or the above
+ * code wants to do one commit. Do this and re-try.
+ */
+ if (cmt_retries > 128) {
+ /*
+ * This should not happen unless the journal size limitations
+ * are too tough.
+ */
+ ubifs_err("stuck in space allocation");
+ err = -ENOSPC;
+ goto out;
+ } else if (cmt_retries > 32)
+ ubifs_warn("too many space allocation re-tries (%d)",
+ cmt_retries);
+
+ dbg_jnl("-EAGAIN, commit and retry (retried %d times)",
+ cmt_retries);
+ cmt_retries += 1;
+
+ err = ubifs_run_commit(c);
+ if (err)
+ return err;
+ goto again;
+
+out:
+ ubifs_err("cannot reserve %d bytes in jhead %d, error %d",
+ len, jhead, err);
+ if (err == -ENOSPC) {
+ /* This are some budgeting problems, print useful information */
+ down_write(&c->commit_sem);
+ spin_lock(&c->space_lock);
+ dbg_dump_stack();
+ dbg_dump_budg(c);
+ spin_unlock(&c->space_lock);
+ dbg_dump_lprops(c);
+ cmt_retries = dbg_check_lprops(c);
+ up_write(&c->commit_sem);
+ }
+ return err;
+}
+
+/**
+ * release_head - release a journal head.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head
+ *
+ * This function releases journal head @jhead which was locked by
+ * the 'make_reservation()' function. It has to be called after each successful
+ * 'make_reservation()' invocation.
+ */
+static inline void release_head(struct ubifs_info *c, int jhead)
+{
+ mutex_unlock(&c->jheads[jhead].wbuf.io_mutex);
+}
+
+/**
+ * finish_reservation - finish a reservation.
+ * @c: UBIFS file-system description object
+ *
+ * This function finishes journal space reservation. It must be called after
+ * 'make_reservation()'.
+ */
+static void finish_reservation(struct ubifs_info *c)
+{
+ up_read(&c->commit_sem);
+}
+
+/**
+ * get_dent_type - translate VFS inode mode to UBIFS directory entry type.
+ * @mode: inode mode
+ */
+static int get_dent_type(int mode)
+{
+ switch (mode & S_IFMT) {
+ case S_IFREG:
+ return UBIFS_ITYPE_REG;
+ case S_IFDIR:
+ return UBIFS_ITYPE_DIR;
+ case S_IFLNK:
+ return UBIFS_ITYPE_LNK;
+ case S_IFBLK:
+ return UBIFS_ITYPE_BLK;
+ case S_IFCHR:
+ return UBIFS_ITYPE_CHR;
+ case S_IFIFO:
+ return UBIFS_ITYPE_FIFO;
+ case S_IFSOCK:
+ return UBIFS_ITYPE_SOCK;
+ default:
+ BUG();
+ }
+ return 0;
+}
+
+/**
+ * pack_inode - pack an inode node.
+ * @c: UBIFS file-system description object
+ * @ino: buffer in which to pack inode node
+ * @inode: inode to pack
+ * @last: indicates the last node of the group
+ */
+static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino,
+ const struct inode *inode, int last)
+{
+ int data_len = 0, last_reference = !inode->i_nlink;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ ino->ch.node_type = UBIFS_INO_NODE;
+ ino_key_init_flash(c, &ino->key, inode->i_ino);
+ ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum);
+ ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec);
+ ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+ ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec);
+ ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec);
+ ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ino->uid = cpu_to_le32(inode->i_uid);
+ ino->gid = cpu_to_le32(inode->i_gid);
+ ino->mode = cpu_to_le32(inode->i_mode);
+ ino->flags = cpu_to_le32(ui->flags);
+ ino->size = cpu_to_le64(ui->ui_size);
+ ino->nlink = cpu_to_le32(inode->i_nlink);
+ ino->compr_type = cpu_to_le16(ui->compr_type);
+ ino->data_len = cpu_to_le32(ui->data_len);
+ ino->xattr_cnt = cpu_to_le32(ui->xattr_cnt);
+ ino->xattr_size = cpu_to_le32(ui->xattr_size);
+ ino->xattr_names = cpu_to_le32(ui->xattr_names);
+ zero_ino_node_unused(ino);
+
+ /*
+ * Drop the attached data if this is a deletion inode, the data is not
+ * needed anymore.
+ */
+ if (!last_reference) {
+ memcpy(ino->data, ui->data, ui->data_len);
+ data_len = ui->data_len;
+ }
+
+ ubifs_prep_grp_node(c, ino, UBIFS_INO_NODE_SZ + data_len, last);
+}
+
+/**
+ * mark_inode_clean - mark UBIFS inode as clean.
+ * @c: UBIFS file-system description object
+ * @ui: UBIFS inode to mark as clean
+ *
+ * This helper function marks UBIFS inode @ui as clean by cleaning the
+ * @ui->dirty flag and releasing its budget. Note, VFS may still treat the
+ * inode as dirty and try to write it back, but 'ubifs_write_inode()' would
+ * just do nothing.
+ */
+static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui)
+{
+ if (ui->dirty)
+ ubifs_release_dirty_inode_budget(c, ui);
+ ui->dirty = 0;
+}
+
+/**
+ * ubifs_jnl_update - update inode.
+ * @c: UBIFS file-system description object
+ * @dir: parent inode or host inode in case of extended attributes
+ * @nm: directory entry name
+ * @inode: inode to update
+ * @deletion: indicates a directory entry deletion i.e unlink or rmdir
+ * @xent: non-zero if the directory entry is an extended attribute entry
+ *
+ * This function updates an inode by writing a directory entry (or extended
+ * attribute entry), the inode itself, and the parent directory inode (or the
+ * host inode) to the journal.
+ *
+ * The function writes the host inode @dir last, which is important in case of
+ * extended attributes. Indeed, then we guarantee that if the host inode gets
+ * synchronized (with 'fsync()'), and the write-buffer it sits in gets flushed,
+ * the extended attribute inode gets flushed too. And this is exactly what the
+ * user expects - synchronizing the host inode synchronizes its extended
+ * attributes. Similarly, this guarantees that if @dir is synchronized, its
+ * directory entry corresponding to @nm gets synchronized too.
+ *
+ * If the inode (@inode) or the parent directory (@dir) are synchronous, this
+ * function synchronizes the write-buffer.
+ *
+ * This function marks the @dir and @inode inodes as clean and returns zero on
+ * success. In case of failure, a negative error code is returned.
+ */
+int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
+ const struct qstr *nm, const struct inode *inode,
+ int deletion, int xent)
+{
+ int err, dlen, ilen, len, lnum, ino_offs, dent_offs;
+ int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir);
+ int last_reference = !!(deletion && inode->i_nlink == 0);
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_inode *dir_ui = ubifs_inode(dir);
+ struct ubifs_dent_node *dent;
+ struct ubifs_ino_node *ino;
+ union ubifs_key dent_key, ino_key;
+
+ dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu",
+ inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino);
+ ubifs_assert(dir_ui->data_len == 0);
+ ubifs_assert(mutex_is_locked(&dir_ui->ui_mutex));
+
+ dlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
+ ilen = UBIFS_INO_NODE_SZ;
+
+ /*
+ * If the last reference to the inode is being deleted, then there is
+ * no need to attach and write inode data, it is being deleted anyway.
+ * And if the inode is being deleted, no need to synchronize
+ * write-buffer even if the inode is synchronous.
+ */
+ if (!last_reference) {
+ ilen += ui->data_len;
+ sync |= IS_SYNC(inode);
+ }
+
+ aligned_dlen = ALIGN(dlen, 8);
+ aligned_ilen = ALIGN(ilen, 8);
+ len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ;
+ dent = kmalloc(len, GFP_NOFS);
+ if (!dent)
+ return -ENOMEM;
+
+ /* Make reservation before allocating sequence numbers */
+ err = make_reservation(c, BASEHD, len);
+ if (err)
+ goto out_free;
+
+ if (!xent) {
+ dent->ch.node_type = UBIFS_DENT_NODE;
+ dent_key_init(c, &dent_key, dir->i_ino, nm);
+ } else {
+ dent->ch.node_type = UBIFS_XENT_NODE;
+ xent_key_init(c, &dent_key, dir->i_ino, nm);
+ }
+
+ key_write(c, &dent_key, dent->key);
+ dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino);
+ dent->type = get_dent_type(inode->i_mode);
+ dent->nlen = cpu_to_le16(nm->len);
+ memcpy(dent->name, nm->name, nm->len);
+ dent->name[nm->len] = '\0';
+ zero_dent_node_unused(dent);
+ ubifs_prep_grp_node(c, dent, dlen, 0);
+
+ ino = (void *)dent + aligned_dlen;
+ pack_inode(c, ino, inode, 0);
+ ino = (void *)ino + aligned_ilen;
+ pack_inode(c, ino, dir, 1);
+
+ if (last_reference) {
+ err = ubifs_add_orphan(c, inode->i_ino);
+ if (err) {
+ release_head(c, BASEHD);
+ goto out_finish;
+ }
+ ui->del_cmtno = c->cmt_no;
+ }
+
+ err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync);
+ if (err)
+ goto out_release;
+ if (!sync) {
+ struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
+
+ ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino);
+ ubifs_wbuf_add_ino_nolock(wbuf, dir->i_ino);
+ }
+ release_head(c, BASEHD);
+ kfree(dent);
+
+ if (deletion) {
+ err = ubifs_tnc_remove_nm(c, &dent_key, nm);
+ if (err)
+ goto out_ro;
+ err = ubifs_add_dirt(c, lnum, dlen);
+ } else
+ err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm);
+ if (err)
+ goto out_ro;
+
+ /*
+ * Note, we do not remove the inode from TNC even if the last reference
+ * to it has just been deleted, because the inode may still be opened.
+ * Instead, the inode has been added to orphan lists and the orphan
+ * subsystem will take further care about it.
+ */
+ ino_key_init(c, &ino_key, inode->i_ino);
+ ino_offs = dent_offs + aligned_dlen;
+ err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen);
+ if (err)
+ goto out_ro;
+
+ ino_key_init(c, &ino_key, dir->i_ino);
+ ino_offs += aligned_ilen;
+ err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ);
+ if (err)
+ goto out_ro;
+
+ finish_reservation(c);
+ spin_lock(&ui->ui_lock);
+ ui->synced_i_size = ui->ui_size;
+ spin_unlock(&ui->ui_lock);
+ mark_inode_clean(c, ui);
+ mark_inode_clean(c, dir_ui);
+ return 0;
+
+out_finish:
+ finish_reservation(c);
+out_free:
+ kfree(dent);
+ return err;
+
+out_release:
+ release_head(c, BASEHD);
+out_ro:
+ ubifs_ro_mode(c, err);
+ if (last_reference)
+ ubifs_delete_orphan(c, inode->i_ino);
+ finish_reservation(c);
+ return err;
+}
+
+/**
+ * ubifs_jnl_write_data - write a data node to the journal.
+ * @c: UBIFS file-system description object
+ * @inode: inode the data node belongs to
+ * @key: node key
+ * @buf: buffer to write
+ * @len: data length (must not exceed %UBIFS_BLOCK_SIZE)
+ *
+ * This function writes a data node to the journal. Returns %0 if the data node
+ * was successfully written, and a negative error code in case of failure.
+ */
+int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
+ const union ubifs_key *key, const void *buf, int len)
+{
+ struct ubifs_data_node *data;
+ int err, lnum, offs, compr_type, out_len;
+ int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ dbg_jnl("ino %lu, blk %u, len %d, key %s", key_inum(c, key),
+ key_block(c, key), len, DBGKEY(key));
+ ubifs_assert(len <= UBIFS_BLOCK_SIZE);
+
+ data = kmalloc(dlen, GFP_NOFS);
+ if (!data)
+ return -ENOMEM;
+
+ data->ch.node_type = UBIFS_DATA_NODE;
+ key_write(c, key, &data->key);
+ data->size = cpu_to_le32(len);
+ zero_data_node_unused(data);
+
+ if (!(ui->flags && UBIFS_COMPR_FL))
+ /* Compression is disabled for this inode */
+ compr_type = UBIFS_COMPR_NONE;
+ else
+ compr_type = ui->compr_type;
+
+ out_len = dlen - UBIFS_DATA_NODE_SZ;
+ ubifs_compress(buf, len, &data->data, &out_len, &compr_type);
+ ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
+
+ dlen = UBIFS_DATA_NODE_SZ + out_len;
+ data->compr_type = cpu_to_le16(compr_type);
+
+ /* Make reservation before allocating sequence numbers */
+ err = make_reservation(c, DATAHD, dlen);
+ if (err)
+ goto out_free;
+
+ err = write_node(c, DATAHD, data, dlen, &lnum, &offs);
+ if (err)
+ goto out_release;
+ ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key));
+ release_head(c, DATAHD);
+
+ err = ubifs_tnc_add(c, key, lnum, offs, dlen);
+ if (err)
+ goto out_ro;
+
+ finish_reservation(c);
+ kfree(data);
+ return 0;
+
+out_release:
+ release_head(c, DATAHD);
+out_ro:
+ ubifs_ro_mode(c, err);
+ finish_reservation(c);
+out_free:
+ kfree(data);
+ return err;
+}
+
+/**
+ * ubifs_jnl_write_inode - flush inode to the journal.
+ * @c: UBIFS file-system description object
+ * @inode: inode to flush
+ *
+ * This function writes inode @inode to the journal. If the inode is
+ * synchronous, it also synchronizes the write-buffer. Returns zero in case of
+ * success and a negative error code in case of failure.
+ */
+int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
+{
+ int err, lnum, offs;
+ struct ubifs_ino_node *ino;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink;
+
+ dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink);
+
+ /*
+ * If the inode is being deleted, do not write the attached data. No
+ * need to synchronize the write-buffer either.
+ */
+ if (!last_reference) {
+ len += ui->data_len;
+ sync = IS_SYNC(inode);
+ }
+ ino = kmalloc(len, GFP_NOFS);
+ if (!ino)
+ return -ENOMEM;
+
+ /* Make reservation before allocating sequence numbers */
+ err = make_reservation(c, BASEHD, len);
+ if (err)
+ goto out_free;
+
+ pack_inode(c, ino, inode, 1);
+ err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
+ if (err)
+ goto out_release;
+ if (!sync)
+ ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
+ inode->i_ino);
+ release_head(c, BASEHD);
+
+ if (last_reference) {
+ err = ubifs_tnc_remove_ino(c, inode->i_ino);
+ if (err)
+ goto out_ro;
+ ubifs_delete_orphan(c, inode->i_ino);
+ err = ubifs_add_dirt(c, lnum, len);
+ } else {
+ union ubifs_key key;
+
+ ino_key_init(c, &key, inode->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs, len);
+ }
+ if (err)
+ goto out_ro;
+
+ finish_reservation(c);
+ spin_lock(&ui->ui_lock);
+ ui->synced_i_size = ui->ui_size;
+ spin_unlock(&ui->ui_lock);
+ kfree(ino);
+ return 0;
+
+out_release:
+ release_head(c, BASEHD);
+out_ro:
+ ubifs_ro_mode(c, err);
+ finish_reservation(c);
+out_free:
+ kfree(ino);
+ return err;
+}
+
+/**
+ * ubifs_jnl_delete_inode - delete an inode.
+ * @c: UBIFS file-system description object
+ * @inode: inode to delete
+ *
+ * This function deletes inode @inode which includes removing it from orphans,
+ * deleting it from TNC and, in some cases, writing a deletion inode to the
+ * journal.
+ *
+ * When regular file inodes are unlinked or a directory inode is removed, the
+ * 'ubifs_jnl_update()' function writes a corresponding deletion inode and
+ * direntry to the media, and adds the inode to orphans. After this, when the
+ * last reference to this inode has been dropped, this function is called. In
+ * general, it has to write one more deletion inode to the media, because if
+ * a commit happened between 'ubifs_jnl_update()' and
+ * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal
+ * anymore, and in fact it might not be on the flash anymore, because it might
+ * have been garbage-collected already. And for optimization reasons UBIFS does
+ * not read the orphan area if it has been unmounted cleanly, so it would have
+ * no indication in the journal that there is a deleted inode which has to be
+ * removed from TNC.
+ *
+ * However, if there was no commit between 'ubifs_jnl_update()' and
+ * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion
+ * inode to the media for the second time. And this is quite a typical case.
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode)
+{
+ int err;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ ubifs_assert(inode->i_nlink == 0);
+
+ if (ui->del_cmtno != c->cmt_no)
+ /* A commit happened for sure */
+ return ubifs_jnl_write_inode(c, inode);
+
+ down_read(&c->commit_sem);
+ /*
+ * Check commit number again, because the first test has been done
+ * without @c->commit_sem, so a commit might have happened.
+ */
+ if (ui->del_cmtno != c->cmt_no) {
+ up_read(&c->commit_sem);
+ return ubifs_jnl_write_inode(c, inode);
+ }
+
+ err = ubifs_tnc_remove_ino(c, inode->i_ino);
+ if (err)
+ ubifs_ro_mode(c, err);
+ else
+ ubifs_delete_orphan(c, inode->i_ino);
+ up_read(&c->commit_sem);
+ return err;
+}
+
+/**
+ * ubifs_jnl_rename - rename a directory entry.
+ * @c: UBIFS file-system description object
+ * @old_dir: parent inode of directory entry to rename
+ * @old_dentry: directory entry to rename
+ * @new_dir: parent inode of directory entry to rename
+ * @new_dentry: new directory entry (or directory entry to replace)
+ * @sync: non-zero if the write-buffer has to be synchronized
+ *
+ * This function implements the re-name operation which may involve writing up
+ * to 3 inodes and 2 directory entries. It marks the written inodes as clean
+ * and returns zero on success. In case of failure, a negative error code is
+ * returned.
+ */
+int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
+ const struct dentry *old_dentry,
+ const struct inode *new_dir,
+ const struct dentry *new_dentry, int sync)
+{
+ void *p;
+ union ubifs_key key;
+ struct ubifs_dent_node *dent, *dent2;
+ int err, dlen1, dlen2, ilen, lnum, offs, len;
+ const struct inode *old_inode = old_dentry->d_inode;
+ const struct inode *new_inode = new_dentry->d_inode;
+ int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
+ int last_reference = !!(new_inode && new_inode->i_nlink == 0);
+ int move = (old_dir != new_dir);
+ struct ubifs_inode *uninitialized_var(new_ui);
+
+ dbg_jnl("dent '%.*s' in dir ino %lu to dent '%.*s' in dir ino %lu",
+ old_dentry->d_name.len, old_dentry->d_name.name,
+ old_dir->i_ino, new_dentry->d_name.len,
+ new_dentry->d_name.name, new_dir->i_ino);
+ ubifs_assert(ubifs_inode(old_dir)->data_len == 0);
+ ubifs_assert(ubifs_inode(new_dir)->data_len == 0);
+ ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex));
+ ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex));
+
+ dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1;
+ dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1;
+ if (new_inode) {
+ new_ui = ubifs_inode(new_inode);
+ ubifs_assert(mutex_is_locked(&new_ui->ui_mutex));
+ ilen = UBIFS_INO_NODE_SZ;
+ if (!last_reference)
+ ilen += new_ui->data_len;
+ } else
+ ilen = 0;
+
+ aligned_dlen1 = ALIGN(dlen1, 8);
+ aligned_dlen2 = ALIGN(dlen2, 8);
+ len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8);
+ if (old_dir != new_dir)
+ len += plen;
+ dent = kmalloc(len, GFP_NOFS);
+ if (!dent)
+ return -ENOMEM;
+
+ /* Make reservation before allocating sequence numbers */
+ err = make_reservation(c, BASEHD, len);
+ if (err)
+ goto out_free;
+
+ /* Make new dent */
+ dent->ch.node_type = UBIFS_DENT_NODE;
+ dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name);
+ dent->inum = cpu_to_le64(old_inode->i_ino);
+ dent->type = get_dent_type(old_inode->i_mode);
+ dent->nlen = cpu_to_le16(new_dentry->d_name.len);
+ memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len);
+ dent->name[new_dentry->d_name.len] = '\0';
+ zero_dent_node_unused(dent);
+ ubifs_prep_grp_node(c, dent, dlen1, 0);
+
+ /* Make deletion dent */
+ dent2 = (void *)dent + aligned_dlen1;
+ dent2->ch.node_type = UBIFS_DENT_NODE;
+ dent_key_init_flash(c, &dent2->key, old_dir->i_ino,
+ &old_dentry->d_name);
+ dent2->inum = 0;
+ dent2->type = DT_UNKNOWN;
+ dent2->nlen = cpu_to_le16(old_dentry->d_name.len);
+ memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len);
+ dent2->name[old_dentry->d_name.len] = '\0';
+ zero_dent_node_unused(dent2);
+ ubifs_prep_grp_node(c, dent2, dlen2, 0);
+
+ p = (void *)dent2 + aligned_dlen2;
+ if (new_inode) {
+ pack_inode(c, p, new_inode, 0);
+ p += ALIGN(ilen, 8);
+ }
+
+ if (!move)
+ pack_inode(c, p, old_dir, 1);
+ else {
+ pack_inode(c, p, old_dir, 0);
+ p += ALIGN(plen, 8);
+ pack_inode(c, p, new_dir, 1);
+ }
+
+ if (last_reference) {
+ err = ubifs_add_orphan(c, new_inode->i_ino);
+ if (err) {
+ release_head(c, BASEHD);
+ goto out_finish;
+ }
+ new_ui->del_cmtno = c->cmt_no;
+ }
+
+ err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync);
+ if (err)
+ goto out_release;
+ if (!sync) {
+ struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
+
+ ubifs_wbuf_add_ino_nolock(wbuf, new_dir->i_ino);
+ ubifs_wbuf_add_ino_nolock(wbuf, old_dir->i_ino);
+ if (new_inode)
+ ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
+ new_inode->i_ino);
+ }
+ release_head(c, BASEHD);
+
+ dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name);
+ err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name);
+ if (err)
+ goto out_ro;
+
+ err = ubifs_add_dirt(c, lnum, dlen2);
+ if (err)
+ goto out_ro;
+
+ dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name);
+ err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name);
+ if (err)
+ goto out_ro;
+
+ offs += aligned_dlen1 + aligned_dlen2;
+ if (new_inode) {
+ ino_key_init(c, &key, new_inode->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs, ilen);
+ if (err)
+ goto out_ro;
+ offs += ALIGN(ilen, 8);
+ }
+
+ ino_key_init(c, &key, old_dir->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+ if (err)
+ goto out_ro;
+
+ if (old_dir != new_dir) {
+ offs += ALIGN(plen, 8);
+ ino_key_init(c, &key, new_dir->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs, plen);
+ if (err)
+ goto out_ro;
+ }
+
+ finish_reservation(c);
+ if (new_inode) {
+ mark_inode_clean(c, new_ui);
+ spin_lock(&new_ui->ui_lock);
+ new_ui->synced_i_size = new_ui->ui_size;
+ spin_unlock(&new_ui->ui_lock);
+ }
+ mark_inode_clean(c, ubifs_inode(old_dir));
+ if (move)
+ mark_inode_clean(c, ubifs_inode(new_dir));
+ kfree(dent);
+ return 0;
+
+out_release:
+ release_head(c, BASEHD);
+out_ro:
+ ubifs_ro_mode(c, err);
+ if (last_reference)
+ ubifs_delete_orphan(c, new_inode->i_ino);
+out_finish:
+ finish_reservation(c);
+out_free:
+ kfree(dent);
+ return err;
+}
+
+/**
+ * recomp_data_node - re-compress a truncated data node.
+ * @dn: data node to re-compress
+ * @new_len: new length
+ *
+ * This function is used when an inode is truncated and the last data node of
+ * the inode has to be re-compressed and re-written.
+ */
+static int recomp_data_node(struct ubifs_data_node *dn, int *new_len)
+{
+ void *buf;
+ int err, len, compr_type, out_len;
+
+ out_len = le32_to_cpu(dn->size);
+ buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+
+ len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+ compr_type = le16_to_cpu(dn->compr_type);
+ err = ubifs_decompress(&dn->data, len, buf, &out_len, compr_type);
+ if (err)
+ goto out;
+
+ ubifs_compress(buf, *new_len, &dn->data, &out_len, &compr_type);
+ ubifs_assert(out_len <= UBIFS_BLOCK_SIZE);
+ dn->compr_type = cpu_to_le16(compr_type);
+ dn->size = cpu_to_le32(*new_len);
+ *new_len = UBIFS_DATA_NODE_SZ + out_len;
+out:
+ kfree(buf);
+ return err;
+}
+
+/**
+ * ubifs_jnl_truncate - update the journal for a truncation.
+ * @c: UBIFS file-system description object
+ * @inode: inode to truncate
+ * @old_size: old size
+ * @new_size: new size
+ *
+ * When the size of a file decreases due to truncation, a truncation node is
+ * written, the journal tree is updated, and the last data block is re-written
+ * if it has been affected. The inode is also updated in order to synchronize
+ * the new inode size.
+ *
+ * This function marks the inode as clean and returns zero on success. In case
+ * of failure, a negative error code is returned.
+ */
+int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
+ loff_t old_size, loff_t new_size)
+{
+ union ubifs_key key, to_key;
+ struct ubifs_ino_node *ino;
+ struct ubifs_trun_node *trun;
+ struct ubifs_data_node *uninitialized_var(dn);
+ int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode);
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ ino_t inum = inode->i_ino;
+ unsigned int blk;
+
+ dbg_jnl("ino %lu, size %lld -> %lld", inum, old_size, new_size);
+ ubifs_assert(!ui->data_len);
+ ubifs_assert(S_ISREG(inode->i_mode));
+ ubifs_assert(mutex_is_locked(&ui->ui_mutex));
+
+ sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
+ UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR;
+ ino = kmalloc(sz, GFP_NOFS);
+ if (!ino)
+ return -ENOMEM;
+
+ trun = (void *)ino + UBIFS_INO_NODE_SZ;
+ trun->ch.node_type = UBIFS_TRUN_NODE;
+ trun->inum = cpu_to_le32(inum);
+ trun->old_size = cpu_to_le64(old_size);
+ trun->new_size = cpu_to_le64(new_size);
+ zero_trun_node_unused(trun);
+
+ dlen = new_size & (UBIFS_BLOCK_SIZE - 1);
+ if (dlen) {
+ /* Get last data block so it can be truncated */
+ dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
+ blk = new_size >> UBIFS_BLOCK_SHIFT;
+ data_key_init(c, &key, inum, blk);
+ dbg_jnl("last block key %s", DBGKEY(&key));
+ err = ubifs_tnc_lookup(c, &key, dn);
+ if (err == -ENOENT)
+ dlen = 0; /* Not found (so it is a hole) */
+ else if (err)
+ goto out_free;
+ else {
+ if (le32_to_cpu(dn->size) <= dlen)
+ dlen = 0; /* Nothing to do */
+ else {
+ int compr_type = le16_to_cpu(dn->compr_type);
+
+ if (compr_type != UBIFS_COMPR_NONE) {
+ err = recomp_data_node(dn, &dlen);
+ if (err)
+ goto out_free;
+ } else {
+ dn->size = cpu_to_le32(dlen);
+ dlen += UBIFS_DATA_NODE_SZ;
+ }
+ zero_data_node_unused(dn);
+ }
+ }
+ }
+
+ /* Must make reservation before allocating sequence numbers */
+ len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ;
+ if (dlen)
+ len += dlen;
+ err = make_reservation(c, BASEHD, len);
+ if (err)
+ goto out_free;
+
+ pack_inode(c, ino, inode, 0);
+ ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1);
+ if (dlen)
+ ubifs_prep_grp_node(c, dn, dlen, 1);
+
+ err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
+ if (err)
+ goto out_release;
+ if (!sync)
+ ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum);
+ release_head(c, BASEHD);
+
+ if (dlen) {
+ sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ;
+ err = ubifs_tnc_add(c, &key, lnum, sz, dlen);
+ if (err)
+ goto out_ro;
+ }
+
+ ino_key_init(c, &key, inum);
+ err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ);
+ if (err)
+ goto out_ro;
+
+ err = ubifs_add_dirt(c, lnum, UBIFS_TRUN_NODE_SZ);
+ if (err)
+ goto out_ro;
+
+ bit = new_size & (UBIFS_BLOCK_SIZE - 1);
+ blk = (new_size >> UBIFS_BLOCK_SHIFT) + (bit ? 1 : 0);
+ data_key_init(c, &key, inum, blk);
+
+ bit = old_size & (UBIFS_BLOCK_SIZE - 1);
+ blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1);
+ data_key_init(c, &to_key, inum, blk);
+
+ err = ubifs_tnc_remove_range(c, &key, &to_key);
+ if (err)
+ goto out_ro;
+
+ finish_reservation(c);
+ spin_lock(&ui->ui_lock);
+ ui->synced_i_size = ui->ui_size;
+ spin_unlock(&ui->ui_lock);
+ mark_inode_clean(c, ui);
+ kfree(ino);
+ return 0;
+
+out_release:
+ release_head(c, BASEHD);
+out_ro:
+ ubifs_ro_mode(c, err);
+ finish_reservation(c);
+out_free:
+ kfree(ino);
+ return err;
+}
+
+#ifdef CONFIG_UBIFS_FS_XATTR
+
+/**
+ * ubifs_jnl_delete_xattr - delete an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @inode: extended attribute inode
+ * @nm: extended attribute entry name
+ *
+ * This function delete an extended attribute which is very similar to
+ * un-linking regular files - it writes a deletion xentry, a deletion inode and
+ * updates the target inode. Returns zero in case of success and a negative
+ * error code in case of failure.
+ */
+int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
+ const struct inode *inode, const struct qstr *nm)
+{
+ int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen;
+ struct ubifs_dent_node *xent;
+ struct ubifs_ino_node *ino;
+ union ubifs_key xent_key, key1, key2;
+ int sync = IS_DIRSYNC(host);
+ struct ubifs_inode *host_ui = ubifs_inode(host);
+
+ dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d",
+ host->i_ino, inode->i_ino, nm->name,
+ ubifs_inode(inode)->data_len);
+ ubifs_assert(inode->i_nlink == 0);
+ ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
+
+ /*
+ * Since we are deleting the inode, we do not bother to attach any data
+ * to it and assume its length is %UBIFS_INO_NODE_SZ.
+ */
+ xlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
+ aligned_xlen = ALIGN(xlen, 8);
+ hlen = host_ui->data_len + UBIFS_INO_NODE_SZ;
+ len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8);
+
+ xent = kmalloc(len, GFP_NOFS);
+ if (!xent)
+ return -ENOMEM;
+
+ /* Make reservation before allocating sequence numbers */
+ err = make_reservation(c, BASEHD, len);
+ if (err) {
+ kfree(xent);
+ return err;
+ }
+
+ xent->ch.node_type = UBIFS_XENT_NODE;
+ xent_key_init(c, &xent_key, host->i_ino, nm);
+ key_write(c, &xent_key, xent->key);
+ xent->inum = 0;
+ xent->type = get_dent_type(inode->i_mode);
+ xent->nlen = cpu_to_le16(nm->len);
+ memcpy(xent->name, nm->name, nm->len);
+ xent->name[nm->len] = '\0';
+ zero_dent_node_unused(xent);
+ ubifs_prep_grp_node(c, xent, xlen, 0);
+
+ ino = (void *)xent + aligned_xlen;
+ pack_inode(c, ino, inode, 0);
+ ino = (void *)ino + UBIFS_INO_NODE_SZ;
+ pack_inode(c, ino, host, 1);
+
+ err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync);
+ if (!sync && !err)
+ ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino);
+ release_head(c, BASEHD);
+ kfree(xent);
+ if (err)
+ goto out_ro;
+
+ /* Remove the extended attribute entry from TNC */
+ err = ubifs_tnc_remove_nm(c, &xent_key, nm);
+ if (err)
+ goto out_ro;
+ err = ubifs_add_dirt(c, lnum, xlen);
+ if (err)
+ goto out_ro;
+
+ /*
+ * Remove all nodes belonging to the extended attribute inode from TNC.
+ * Well, there actually must be only one node - the inode itself.
+ */
+ lowest_ino_key(c, &key1, inode->i_ino);
+ highest_ino_key(c, &key2, inode->i_ino);
+ err = ubifs_tnc_remove_range(c, &key1, &key2);
+ if (err)
+ goto out_ro;
+ err = ubifs_add_dirt(c, lnum, UBIFS_INO_NODE_SZ);
+ if (err)
+ goto out_ro;
+
+ /* And update TNC with the new host inode position */
+ ino_key_init(c, &key1, host->i_ino);
+ err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen);
+ if (err)
+ goto out_ro;
+
+ finish_reservation(c);
+ spin_lock(&host_ui->ui_lock);
+ host_ui->synced_i_size = host_ui->ui_size;
+ spin_unlock(&host_ui->ui_lock);
+ mark_inode_clean(c, host_ui);
+ return 0;
+
+out_ro:
+ ubifs_ro_mode(c, err);
+ finish_reservation(c);
+ return err;
+}
+
+/**
+ * ubifs_jnl_change_xattr - change an extended attribute.
+ * @c: UBIFS file-system description object
+ * @inode: extended attribute inode
+ * @host: host inode
+ *
+ * This function writes the updated version of an extended attribute inode and
+ * the host inode tho the journal (to the base head). The host inode is written
+ * after the extended attribute inode in order to guarantee that the extended
+ * attribute will be flushed when the inode is synchronized by 'fsync()' and
+ * consequently, the write-buffer is synchronized. This function returns zero
+ * in case of success and a negative error code in case of failure.
+ */
+int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
+ const struct inode *host)
+{
+ int err, len1, len2, aligned_len, aligned_len1, lnum, offs;
+ struct ubifs_inode *host_ui = ubifs_inode(host);
+ struct ubifs_ino_node *ino;
+ union ubifs_key key;
+ int sync = IS_DIRSYNC(host);
+
+ dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino);
+ ubifs_assert(host->i_nlink > 0);
+ ubifs_assert(inode->i_nlink > 0);
+ ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
+
+ len1 = UBIFS_INO_NODE_SZ + host_ui->data_len;
+ len2 = UBIFS_INO_NODE_SZ + ubifs_inode(inode)->data_len;
+ aligned_len1 = ALIGN(len1, 8);
+ aligned_len = aligned_len1 + ALIGN(len2, 8);
+
+ ino = kmalloc(aligned_len, GFP_NOFS);
+ if (!ino)
+ return -ENOMEM;
+
+ /* Make reservation before allocating sequence numbers */
+ err = make_reservation(c, BASEHD, aligned_len);
+ if (err)
+ goto out_free;
+
+ pack_inode(c, ino, host, 0);
+ pack_inode(c, (void *)ino + aligned_len1, inode, 1);
+
+ err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0);
+ if (!sync && !err) {
+ struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;
+
+ ubifs_wbuf_add_ino_nolock(wbuf, host->i_ino);
+ ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino);
+ }
+ release_head(c, BASEHD);
+ if (err)
+ goto out_ro;
+
+ ino_key_init(c, &key, host->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs, len1);
+ if (err)
+ goto out_ro;
+
+ ino_key_init(c, &key, inode->i_ino);
+ err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2);
+ if (err)
+ goto out_ro;
+
+ finish_reservation(c);
+ spin_lock(&host_ui->ui_lock);
+ host_ui->synced_i_size = host_ui->ui_size;
+ spin_unlock(&host_ui->ui_lock);
+ mark_inode_clean(c, host_ui);
+ kfree(ino);
+ return 0;
+
+out_ro:
+ ubifs_ro_mode(c, err);
+ finish_reservation(c);
+out_free:
+ kfree(ino);
+ return err;
+}
+
+#endif /* CONFIG_UBIFS_FS_XATTR */
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
new file mode 100644
index 00000000000..8f747600754
--- /dev/null
+++ b/fs/ubifs/key.h
@@ -0,0 +1,533 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This header contains various key-related definitions and helper function.
+ * UBIFS allows several key schemes, so we access key fields only via these
+ * helpers. At the moment only one key scheme is supported.
+ *
+ * Simple key scheme
+ * ~~~~~~~~~~~~~~~~~
+ *
+ * Keys are 64-bits long. First 32-bits are inode number (parent inode number
+ * in case of direntry key). Next 3 bits are node type. The last 29 bits are
+ * 4KiB offset in case of inode node, and direntry hash in case of a direntry
+ * node. We use "r5" hash borrowed from reiserfs.
+ */
+
+#ifndef __UBIFS_KEY_H__
+#define __UBIFS_KEY_H__
+
+/**
+ * key_r5_hash - R5 hash function (borrowed from reiserfs).
+ * @s: direntry name
+ * @len: name length
+ */
+static inline uint32_t key_r5_hash(const char *s, int len)
+{
+ uint32_t a = 0;
+ const signed char *str = (const signed char *)s;
+
+ while (*str) {
+ a += *str << 4;
+ a += *str >> 4;
+ a *= 11;
+ str++;
+ }
+
+ a &= UBIFS_S_KEY_HASH_MASK;
+
+ /*
+ * We use hash values as offset in directories, so values %0 and %1 are
+ * reserved for "." and "..". %2 is reserved for "end of readdir"
+ * marker.
+ */
+ if (unlikely(a >= 0 && a <= 2))
+ a += 3;
+ return a;
+}
+
+/**
+ * key_test_hash - testing hash function.
+ * @str: direntry name
+ * @len: name length
+ */
+static inline uint32_t key_test_hash(const char *str, int len)
+{
+ uint32_t a = 0;
+
+ len = min_t(uint32_t, len, 4);
+ memcpy(&a, str, len);
+ a &= UBIFS_S_KEY_HASH_MASK;
+ if (unlikely(a >= 0 && a <= 2))
+ a += 3;
+ return a;
+}
+
+/**
+ * ino_key_init - initialize inode key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ */
+static inline void ino_key_init(const struct ubifs_info *c,
+ union ubifs_key *key, ino_t inum)
+{
+ key->u32[0] = inum;
+ key->u32[1] = UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS;
+}
+
+/**
+ * ino_key_init_flash - initialize on-flash inode key.
+ * @c: UBIFS file-system description object
+ * @k: key to initialize
+ * @inum: inode number
+ */
+static inline void ino_key_init_flash(const struct ubifs_info *c, void *k,
+ ino_t inum)
+{
+ union ubifs_key *key = k;
+
+ key->j32[0] = cpu_to_le32(inum);
+ key->j32[1] = cpu_to_le32(UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS);
+ memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+}
+
+/**
+ * lowest_ino_key - get the lowest possible inode key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ */
+static inline void lowest_ino_key(const struct ubifs_info *c,
+ union ubifs_key *key, ino_t inum)
+{
+ key->u32[0] = inum;
+ key->u32[1] = 0;
+}
+
+/**
+ * highest_ino_key - get the highest possible inode key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ */
+static inline void highest_ino_key(const struct ubifs_info *c,
+ union ubifs_key *key, ino_t inum)
+{
+ key->u32[0] = inum;
+ key->u32[1] = 0xffffffff;
+}
+
+/**
+ * dent_key_init - initialize directory entry key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: parent inode number
+ * @nm: direntry name and length
+ */
+static inline void dent_key_init(const struct ubifs_info *c,
+ union ubifs_key *key, ino_t inum,
+ const struct qstr *nm)
+{
+ uint32_t hash = c->key_hash(nm->name, nm->len);
+
+ ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
+ key->u32[0] = inum;
+ key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS);
+}
+
+/**
+ * dent_key_init_hash - initialize directory entry key without re-calculating
+ * hash function.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: parent inode number
+ * @hash: direntry name hash
+ */
+static inline void dent_key_init_hash(const struct ubifs_info *c,
+ union ubifs_key *key, ino_t inum,
+ uint32_t hash)
+{
+ ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
+ key->u32[0] = inum;
+ key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS);
+}
+
+/**
+ * dent_key_init_flash - initialize on-flash directory entry key.
+ * @c: UBIFS file-system description object
+ * @k: key to initialize
+ * @inum: parent inode number
+ * @nm: direntry name and length
+ */
+static inline void dent_key_init_flash(const struct ubifs_info *c, void *k,
+ ino_t inum, const struct qstr *nm)
+{
+ union ubifs_key *key = k;
+ uint32_t hash = c->key_hash(nm->name, nm->len);
+
+ ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
+ key->j32[0] = cpu_to_le32(inum);
+ key->j32[1] = cpu_to_le32(hash |
+ (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS));
+ memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+}
+
+/**
+ * lowest_dent_key - get the lowest possible directory entry key.
+ * @c: UBIFS file-system description object
+ * @key: where to store the lowest key
+ * @inum: parent inode number
+ */
+static inline void lowest_dent_key(const struct ubifs_info *c,
+ union ubifs_key *key, ino_t inum)
+{
+ key->u32[0] = inum;
+ key->u32[1] = UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS;
+}
+
+/**
+ * xent_key_init - initialize extended attribute entry key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: host inode number
+ * @nm: extended attribute entry name and length
+ */
+static inline void xent_key_init(const struct ubifs_info *c,
+ union ubifs_key *key, ino_t inum,
+ const struct qstr *nm)
+{
+ uint32_t hash = c->key_hash(nm->name, nm->len);
+
+ ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
+ key->u32[0] = inum;
+ key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS);
+}
+
+/**
+ * xent_key_init_hash - initialize extended attribute entry key without
+ * re-calculating hash function.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: host inode number
+ * @hash: extended attribute entry name hash
+ */
+static inline void xent_key_init_hash(const struct ubifs_info *c,
+ union ubifs_key *key, ino_t inum,
+ uint32_t hash)
+{
+ ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
+ key->u32[0] = inum;
+ key->u32[1] = hash | (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS);
+}
+
+/**
+ * xent_key_init_flash - initialize on-flash extended attribute entry key.
+ * @c: UBIFS file-system description object
+ * @k: key to initialize
+ * @inum: host inode number
+ * @nm: extended attribute entry name and length
+ */
+static inline void xent_key_init_flash(const struct ubifs_info *c, void *k,
+ ino_t inum, const struct qstr *nm)
+{
+ union ubifs_key *key = k;
+ uint32_t hash = c->key_hash(nm->name, nm->len);
+
+ ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
+ key->j32[0] = cpu_to_le32(inum);
+ key->j32[1] = cpu_to_le32(hash |
+ (UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS));
+ memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+}
+
+/**
+ * lowest_xent_key - get the lowest possible extended attribute entry key.
+ * @c: UBIFS file-system description object
+ * @key: where to store the lowest key
+ * @inum: host inode number
+ */
+static inline void lowest_xent_key(const struct ubifs_info *c,
+ union ubifs_key *key, ino_t inum)
+{
+ key->u32[0] = inum;
+ key->u32[1] = UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS;
+}
+
+/**
+ * data_key_init - initialize data key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ * @block: block number
+ */
+static inline void data_key_init(const struct ubifs_info *c,
+ union ubifs_key *key, ino_t inum,
+ unsigned int block)
+{
+ ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
+ key->u32[0] = inum;
+ key->u32[1] = block | (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS);
+}
+
+/**
+ * data_key_init_flash - initialize on-flash data key.
+ * @c: UBIFS file-system description object
+ * @k: key to initialize
+ * @inum: inode number
+ * @block: block number
+ */
+static inline void data_key_init_flash(const struct ubifs_info *c, void *k,
+ ino_t inum, unsigned int block)
+{
+ union ubifs_key *key = k;
+
+ ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
+ key->j32[0] = cpu_to_le32(inum);
+ key->j32[1] = cpu_to_le32(block |
+ (UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS));
+ memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+}
+
+/**
+ * trun_key_init - initialize truncation node key.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ *
+ * Note, UBIFS does not have truncation keys on the media and this function is
+ * only used for purposes of replay.
+ */
+static inline void trun_key_init(const struct ubifs_info *c,
+ union ubifs_key *key, ino_t inum)
+{
+ key->u32[0] = inum;
+ key->u32[1] = UBIFS_TRUN_KEY << UBIFS_S_KEY_BLOCK_BITS;
+}
+
+/**
+ * key_type - get key type.
+ * @c: UBIFS file-system description object
+ * @key: key to get type of
+ */
+static inline int key_type(const struct ubifs_info *c,
+ const union ubifs_key *key)
+{
+ return key->u32[1] >> UBIFS_S_KEY_BLOCK_BITS;
+}
+
+/**
+ * key_type_flash - get type of a on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: key to get type of
+ */
+static inline int key_type_flash(const struct ubifs_info *c, const void *k)
+{
+ const union ubifs_key *key = k;
+
+ return le32_to_cpu(key->u32[1]) >> UBIFS_S_KEY_BLOCK_BITS;
+}
+
+/**
+ * key_inum - fetch inode number from key.
+ * @c: UBIFS file-system description object
+ * @k: key to fetch inode number from
+ */
+static inline ino_t key_inum(const struct ubifs_info *c, const void *k)
+{
+ const union ubifs_key *key = k;
+
+ return key->u32[0];
+}
+
+/**
+ * key_inum_flash - fetch inode number from an on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: key to fetch inode number from
+ */
+static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
+{
+ const union ubifs_key *key = k;
+
+ return le32_to_cpu(key->j32[0]);
+}
+
+/**
+ * key_hash - get directory entry hash.
+ * @c: UBIFS file-system description object
+ * @key: the key to get hash from
+ */
+static inline int key_hash(const struct ubifs_info *c,
+ const union ubifs_key *key)
+{
+ return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
+}
+
+/**
+ * key_hash_flash - get directory entry hash from an on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: the key to get hash from
+ */
+static inline int key_hash_flash(const struct ubifs_info *c, const void *k)
+{
+ const union ubifs_key *key = k;
+
+ return le32_to_cpu(key->j32[1]) & UBIFS_S_KEY_HASH_MASK;
+}
+
+/**
+ * key_block - get data block number.
+ * @c: UBIFS file-system description object
+ * @key: the key to get the block number from
+ */
+static inline unsigned int key_block(const struct ubifs_info *c,
+ const union ubifs_key *key)
+{
+ return key->u32[1] & UBIFS_S_KEY_BLOCK_MASK;
+}
+
+/**
+ * key_block_flash - get data block number from an on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: the key to get the block number from
+ */
+static inline unsigned int key_block_flash(const struct ubifs_info *c,
+ const void *k)
+{
+ const union ubifs_key *key = k;
+
+ return le32_to_cpu(key->u32[1]) & UBIFS_S_KEY_BLOCK_MASK;
+}
+
+/**
+ * key_read - transform a key to in-memory format.
+ * @c: UBIFS file-system description object
+ * @from: the key to transform
+ * @to: the key to store the result
+ */
+static inline void key_read(const struct ubifs_info *c, const void *from,
+ union ubifs_key *to)
+{
+ const union ubifs_key *f = from;
+
+ to->u32[0] = le32_to_cpu(f->j32[0]);
+ to->u32[1] = le32_to_cpu(f->j32[1]);
+}
+
+/**
+ * key_write - transform a key from in-memory format.
+ * @c: UBIFS file-system description object
+ * @from: the key to transform
+ * @to: the key to store the result
+ */
+static inline void key_write(const struct ubifs_info *c,
+ const union ubifs_key *from, void *to)
+{
+ union ubifs_key *t = to;
+
+ t->j32[0] = cpu_to_le32(from->u32[0]);
+ t->j32[1] = cpu_to_le32(from->u32[1]);
+ memset(to + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+}
+
+/**
+ * key_write_idx - transform a key from in-memory format for the index.
+ * @c: UBIFS file-system description object
+ * @from: the key to transform
+ * @to: the key to store the result
+ */
+static inline void key_write_idx(const struct ubifs_info *c,
+ const union ubifs_key *from, void *to)
+{
+ union ubifs_key *t = to;
+
+ t->j32[0] = cpu_to_le32(from->u32[0]);
+ t->j32[1] = cpu_to_le32(from->u32[1]);
+}
+
+/**
+ * key_copy - copy a key.
+ * @c: UBIFS file-system description object
+ * @from: the key to copy from
+ * @to: the key to copy to
+ */
+static inline void key_copy(const struct ubifs_info *c,
+ const union ubifs_key *from, union ubifs_key *to)
+{
+ to->u64[0] = from->u64[0];
+}
+
+/**
+ * keys_cmp - compare keys.
+ * @c: UBIFS file-system description object
+ * @key1: the first key to compare
+ * @key2: the second key to compare
+ *
+ * This function compares 2 keys and returns %-1 if @key1 is less than
+ * @key2, 0 if the keys are equivalent and %1 if @key1 is greater than @key2.
+ */
+static inline int keys_cmp(const struct ubifs_info *c,
+ const union ubifs_key *key1,
+ const union ubifs_key *key2)
+{
+ if (key1->u32[0] < key2->u32[0])
+ return -1;
+ if (key1->u32[0] > key2->u32[0])
+ return 1;
+ if (key1->u32[1] < key2->u32[1])
+ return -1;
+ if (key1->u32[1] > key2->u32[1])
+ return 1;
+
+ return 0;
+}
+
+/**
+ * is_hash_key - is a key vulnerable to hash collisions.
+ * @c: UBIFS file-system description object
+ * @key: key
+ *
+ * This function returns %1 if @key is a hashed key or %0 otherwise.
+ */
+static inline int is_hash_key(const struct ubifs_info *c,
+ const union ubifs_key *key)
+{
+ int type = key_type(c, key);
+
+ return type == UBIFS_DENT_KEY || type == UBIFS_XENT_KEY;
+}
+
+/**
+ * key_max_inode_size - get maximum file size allowed by current key format.
+ * @c: UBIFS file-system description object
+ */
+static inline unsigned long long key_max_inode_size(const struct ubifs_info *c)
+{
+ switch (c->key_fmt) {
+ case UBIFS_SIMPLE_KEY_FMT:
+ return (1ULL << UBIFS_S_KEY_BLOCK_BITS) * UBIFS_BLOCK_SIZE;
+ default:
+ return 0;
+ }
+}
+#endif /* !__UBIFS_KEY_H__ */
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
new file mode 100644
index 00000000000..3e0aa736755
--- /dev/null
+++ b/fs/ubifs/log.c
@@ -0,0 +1,807 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file is a part of UBIFS journal implementation and contains various
+ * functions which manipulate the log. The log is a fixed area on the flash
+ * which does not contain any data but refers to buds. The log is a part of the
+ * journal.
+ */
+
+#include "ubifs.h"
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+static int dbg_check_bud_bytes(struct ubifs_info *c);
+#else
+#define dbg_check_bud_bytes(c) 0
+#endif
+
+/**
+ * ubifs_search_bud - search bud LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: logical eraseblock number to search
+ *
+ * This function searches bud LEB @lnum. Returns bud description object in case
+ * of success and %NULL if there is no bud with this LEB number.
+ */
+struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum)
+{
+ struct rb_node *p;
+ struct ubifs_bud *bud;
+
+ spin_lock(&c->buds_lock);
+ p = c->buds.rb_node;
+ while (p) {
+ bud = rb_entry(p, struct ubifs_bud, rb);
+ if (lnum < bud->lnum)
+ p = p->rb_left;
+ else if (lnum > bud->lnum)
+ p = p->rb_right;
+ else {
+ spin_unlock(&c->buds_lock);
+ return bud;
+ }
+ }
+ spin_unlock(&c->buds_lock);
+ return NULL;
+}
+
+/**
+ * ubifs_get_wbuf - get the wbuf associated with a LEB, if there is one.
+ * @c: UBIFS file-system description object
+ * @lnum: logical eraseblock number to search
+ *
+ * This functions returns the wbuf for @lnum or %NULL if there is not one.
+ */
+struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum)
+{
+ struct rb_node *p;
+ struct ubifs_bud *bud;
+ int jhead;
+
+ if (!c->jheads)
+ return NULL;
+
+ spin_lock(&c->buds_lock);
+ p = c->buds.rb_node;
+ while (p) {
+ bud = rb_entry(p, struct ubifs_bud, rb);
+ if (lnum < bud->lnum)
+ p = p->rb_left;
+ else if (lnum > bud->lnum)
+ p = p->rb_right;
+ else {
+ jhead = bud->jhead;
+ spin_unlock(&c->buds_lock);
+ return &c->jheads[jhead].wbuf;
+ }
+ }
+ spin_unlock(&c->buds_lock);
+ return NULL;
+}
+
+/**
+ * next_log_lnum - switch to the next log LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: current log LEB
+ */
+static inline int next_log_lnum(const struct ubifs_info *c, int lnum)
+{
+ lnum += 1;
+ if (lnum > c->log_last)
+ lnum = UBIFS_LOG_LNUM;
+
+ return lnum;
+}
+
+/**
+ * empty_log_bytes - calculate amount of empty space in the log.
+ * @c: UBIFS file-system description object
+ */
+static inline long long empty_log_bytes(const struct ubifs_info *c)
+{
+ long long h, t;
+
+ h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs;
+ t = (long long)c->ltail_lnum * c->leb_size;
+
+ if (h >= t)
+ return c->log_bytes - h + t;
+ else
+ return t - h;
+}
+
+/**
+ * ubifs_add_bud - add bud LEB to the tree of buds and its journal head list.
+ * @c: UBIFS file-system description object
+ * @bud: the bud to add
+ */
+void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
+{
+ struct rb_node **p, *parent = NULL;
+ struct ubifs_bud *b;
+ struct ubifs_jhead *jhead;
+
+ spin_lock(&c->buds_lock);
+ p = &c->buds.rb_node;
+ while (*p) {
+ parent = *p;
+ b = rb_entry(parent, struct ubifs_bud, rb);
+ ubifs_assert(bud->lnum != b->lnum);
+ if (bud->lnum < b->lnum)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+
+ rb_link_node(&bud->rb, parent, p);
+ rb_insert_color(&bud->rb, &c->buds);
+ if (c->jheads) {
+ jhead = &c->jheads[bud->jhead];
+ list_add_tail(&bud->list, &jhead->buds_list);
+ } else
+ ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY));
+
+ /*
+ * Note, although this is a new bud, we anyway account this space now,
+ * before any data has been written to it, because this is about to
+ * guarantee fixed mount time, and this bud will anyway be read and
+ * scanned.
+ */
+ c->bud_bytes += c->leb_size - bud->start;
+
+ dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum,
+ bud->start, bud->jhead, c->bud_bytes);
+ spin_unlock(&c->buds_lock);
+}
+
+/**
+ * ubifs_create_buds_lists - create journal head buds lists for remount rw.
+ * @c: UBIFS file-system description object
+ */
+void ubifs_create_buds_lists(struct ubifs_info *c)
+{
+ struct rb_node *p;
+
+ spin_lock(&c->buds_lock);
+ p = rb_first(&c->buds);
+ while (p) {
+ struct ubifs_bud *bud = rb_entry(p, struct ubifs_bud, rb);
+ struct ubifs_jhead *jhead = &c->jheads[bud->jhead];
+
+ list_add_tail(&bud->list, &jhead->buds_list);
+ p = rb_next(p);
+ }
+ spin_unlock(&c->buds_lock);
+}
+
+/**
+ * ubifs_add_bud_to_log - add a new bud to the log.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head the bud belongs to
+ * @lnum: LEB number of the bud
+ * @offs: starting offset of the bud
+ *
+ * This function writes reference node for the new bud LEB @lnum it to the log,
+ * and adds it to the buds tress. It also makes sure that log size does not
+ * exceed the 'c->max_bud_bytes' limit. Returns zero in case of success,
+ * %-EAGAIN if commit is required, and a negative error codes in case of
+ * failure.
+ */
+int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
+{
+ int err;
+ struct ubifs_bud *bud;
+ struct ubifs_ref_node *ref;
+
+ bud = kmalloc(sizeof(struct ubifs_bud), GFP_NOFS);
+ if (!bud)
+ return -ENOMEM;
+ ref = kzalloc(c->ref_node_alsz, GFP_NOFS);
+ if (!ref) {
+ kfree(bud);
+ return -ENOMEM;
+ }
+
+ mutex_lock(&c->log_mutex);
+
+ if (c->ro_media) {
+ err = -EROFS;
+ goto out_unlock;
+ }
+
+ /* Make sure we have enough space in the log */
+ if (empty_log_bytes(c) - c->ref_node_alsz < c->min_log_bytes) {
+ dbg_log("not enough log space - %lld, required %d",
+ empty_log_bytes(c), c->min_log_bytes);
+ ubifs_commit_required(c);
+ err = -EAGAIN;
+ goto out_unlock;
+ }
+
+ /*
+ * Make sure the the amount of space in buds will not exceed
+ * 'c->max_bud_bytes' limit, because we want to guarantee mount time
+ * limits.
+ *
+ * It is not necessary to hold @c->buds_lock when reading @c->bud_bytes
+ * because we are holding @c->log_mutex. All @c->bud_bytes take place
+ * when both @c->log_mutex and @c->bud_bytes are locked.
+ */
+ if (c->bud_bytes + c->leb_size - offs > c->max_bud_bytes) {
+ dbg_log("bud bytes %lld (%lld max), require commit",
+ c->bud_bytes, c->max_bud_bytes);
+ ubifs_commit_required(c);
+ err = -EAGAIN;
+ goto out_unlock;
+ }
+
+ /*
+ * If the journal is full enough - start background commit. Note, it is
+ * OK to read 'c->cmt_state' without spinlock because integer reads
+ * are atomic in the kernel.
+ */
+ if (c->bud_bytes >= c->bg_bud_bytes &&
+ c->cmt_state == COMMIT_RESTING) {
+ dbg_log("bud bytes %lld (%lld max), initiate BG commit",
+ c->bud_bytes, c->max_bud_bytes);
+ ubifs_request_bg_commit(c);
+ }
+
+ bud->lnum = lnum;
+ bud->start = offs;
+ bud->jhead = jhead;
+
+ ref->ch.node_type = UBIFS_REF_NODE;
+ ref->lnum = cpu_to_le32(bud->lnum);
+ ref->offs = cpu_to_le32(bud->start);
+ ref->jhead = cpu_to_le32(jhead);
+
+ if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
+ c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+ c->lhead_offs = 0;
+ }
+
+ if (c->lhead_offs == 0) {
+ /* Must ensure next log LEB has been unmapped */
+ err = ubifs_leb_unmap(c, c->lhead_lnum);
+ if (err)
+ goto out_unlock;
+ }
+
+ if (bud->start == 0) {
+ /*
+ * Before writing the LEB reference which refers an empty LEB
+ * to the log, we have to make sure it is mapped, because
+ * otherwise we'd risk to refer an LEB with garbage in case of
+ * an unclean reboot, because the target LEB might have been
+ * unmapped, but not yet physically erased.
+ */
+ err = ubi_leb_map(c->ubi, bud->lnum, UBI_SHORTTERM);
+ if (err)
+ goto out_unlock;
+ }
+
+ dbg_log("write ref LEB %d:%d",
+ c->lhead_lnum, c->lhead_offs);
+ err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum,
+ c->lhead_offs, UBI_SHORTTERM);
+ if (err)
+ goto out_unlock;
+
+ c->lhead_offs += c->ref_node_alsz;
+
+ ubifs_add_bud(c, bud);
+
+ mutex_unlock(&c->log_mutex);
+ kfree(ref);
+ return 0;
+
+out_unlock:
+ if (err != -EAGAIN)
+ ubifs_ro_mode(c, err);
+ mutex_unlock(&c->log_mutex);
+ kfree(ref);
+ kfree(bud);
+ return err;
+}
+
+/**
+ * remove_buds - remove used buds.
+ * @c: UBIFS file-system description object
+ *
+ * This function removes use buds from the buds tree. It does not remove the
+ * buds which are pointed to by journal heads.
+ */
+static void remove_buds(struct ubifs_info *c)
+{
+ struct rb_node *p;
+
+ ubifs_assert(list_empty(&c->old_buds));
+ c->cmt_bud_bytes = 0;
+ spin_lock(&c->buds_lock);
+ p = rb_first(&c->buds);
+ while (p) {
+ struct rb_node *p1 = p;
+ struct ubifs_bud *bud;
+ struct ubifs_wbuf *wbuf;
+
+ p = rb_next(p);
+ bud = rb_entry(p1, struct ubifs_bud, rb);
+ wbuf = &c->jheads[bud->jhead].wbuf;
+
+ if (wbuf->lnum == bud->lnum) {
+ /*
+ * Do not remove buds which are pointed to by journal
+ * heads (non-closed buds).
+ */
+ c->cmt_bud_bytes += wbuf->offs - bud->start;
+ dbg_log("preserve %d:%d, jhead %d, bud bytes %d, "
+ "cmt_bud_bytes %lld", bud->lnum, bud->start,
+ bud->jhead, wbuf->offs - bud->start,
+ c->cmt_bud_bytes);
+ bud->start = wbuf->offs;
+ } else {
+ c->cmt_bud_bytes += c->leb_size - bud->start;
+ dbg_log("remove %d:%d, jhead %d, bud bytes %d, "
+ "cmt_bud_bytes %lld", bud->lnum, bud->start,
+ bud->jhead, c->leb_size - bud->start,
+ c->cmt_bud_bytes);
+ rb_erase(p1, &c->buds);
+ list_del(&bud->list);
+ /*
+ * If the commit does not finish, the recovery will need
+ * to replay the journal, in which case the old buds
+ * must be unchanged. Do not release them until post
+ * commit i.e. do not allow them to be garbage
+ * collected.
+ */
+ list_add(&bud->list, &c->old_buds);
+ }
+ }
+ spin_unlock(&c->buds_lock);
+}
+
+/**
+ * ubifs_log_start_commit - start commit.
+ * @c: UBIFS file-system description object
+ * @ltail_lnum: return new log tail LEB number
+ *
+ * The commit operation starts with writing "commit start" node to the log and
+ * reference nodes for all journal heads which will define new journal after
+ * the commit has been finished. The commit start and reference nodes are
+ * written in one go to the nearest empty log LEB (hence, when commit is
+ * finished UBIFS may safely unmap all the previous log LEBs). This function
+ * returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
+{
+ void *buf;
+ struct ubifs_cs_node *cs;
+ struct ubifs_ref_node *ref;
+ int err, i, max_len, len;
+
+ err = dbg_check_bud_bytes(c);
+ if (err)
+ return err;
+
+ max_len = UBIFS_CS_NODE_SZ + c->jhead_cnt * UBIFS_REF_NODE_SZ;
+ max_len = ALIGN(max_len, c->min_io_size);
+ buf = cs = kmalloc(max_len, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+
+ cs->ch.node_type = UBIFS_CS_NODE;
+ cs->cmt_no = cpu_to_le64(c->cmt_no);
+ ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0);
+
+ /*
+ * Note, we do not lock 'c->log_mutex' because this is the commit start
+ * phase and we are exclusively using the log. And we do not lock
+ * write-buffer because nobody can write to the file-system at this
+ * phase.
+ */
+
+ len = UBIFS_CS_NODE_SZ;
+ for (i = 0; i < c->jhead_cnt; i++) {
+ int lnum = c->jheads[i].wbuf.lnum;
+ int offs = c->jheads[i].wbuf.offs;
+
+ if (lnum == -1 || offs == c->leb_size)
+ continue;
+
+ dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i);
+ ref = buf + len;
+ ref->ch.node_type = UBIFS_REF_NODE;
+ ref->lnum = cpu_to_le32(lnum);
+ ref->offs = cpu_to_le32(offs);
+ ref->jhead = cpu_to_le32(i);
+
+ ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0);
+ len += UBIFS_REF_NODE_SZ;
+ }
+
+ ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len);
+
+ /* Switch to the next log LEB */
+ if (c->lhead_offs) {
+ c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+ c->lhead_offs = 0;
+ }
+
+ if (c->lhead_offs == 0) {
+ /* Must ensure next LEB has been unmapped */
+ err = ubifs_leb_unmap(c, c->lhead_lnum);
+ if (err)
+ goto out;
+ }
+
+ len = ALIGN(len, c->min_io_size);
+ dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len);
+ err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM);
+ if (err)
+ goto out;
+
+ *ltail_lnum = c->lhead_lnum;
+
+ c->lhead_offs += len;
+ if (c->lhead_offs == c->leb_size) {
+ c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+ c->lhead_offs = 0;
+ }
+
+ remove_buds(c);
+
+ /*
+ * We have started the commit and now users may use the rest of the log
+ * for new writes.
+ */
+ c->min_log_bytes = 0;
+
+out:
+ kfree(buf);
+ return err;
+}
+
+/**
+ * ubifs_log_end_commit - end commit.
+ * @c: UBIFS file-system description object
+ * @ltail_lnum: new log tail LEB number
+ *
+ * This function is called on when the commit operation was finished. It
+ * moves log tail to new position and unmaps LEBs which contain obsolete data.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
+{
+ int err;
+
+ /*
+ * At this phase we have to lock 'c->log_mutex' because UBIFS allows FS
+ * writes during commit. Its only short "commit" start phase when
+ * writers are blocked.
+ */
+ mutex_lock(&c->log_mutex);
+
+ dbg_log("old tail was LEB %d:0, new tail is LEB %d:0",
+ c->ltail_lnum, ltail_lnum);
+
+ c->ltail_lnum = ltail_lnum;
+ /*
+ * The commit is finished and from now on it must be guaranteed that
+ * there is always enough space for the next commit.
+ */
+ c->min_log_bytes = c->leb_size;
+
+ spin_lock(&c->buds_lock);
+ c->bud_bytes -= c->cmt_bud_bytes;
+ spin_unlock(&c->buds_lock);
+
+ err = dbg_check_bud_bytes(c);
+
+ mutex_unlock(&c->log_mutex);
+ return err;
+}
+
+/**
+ * ubifs_log_post_commit - things to do after commit is completed.
+ * @c: UBIFS file-system description object
+ * @old_ltail_lnum: old log tail LEB number
+ *
+ * Release buds only after commit is completed, because they must be unchanged
+ * if recovery is needed.
+ *
+ * Unmap log LEBs only after commit is completed, because they may be needed for
+ * recovery.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
+{
+ int lnum, err = 0;
+
+ while (!list_empty(&c->old_buds)) {
+ struct ubifs_bud *bud;
+
+ bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
+ err = ubifs_return_leb(c, bud->lnum);
+ if (err)
+ return err;
+ list_del(&bud->list);
+ kfree(bud);
+ }
+ mutex_lock(&c->log_mutex);
+ for (lnum = old_ltail_lnum; lnum != c->ltail_lnum;
+ lnum = next_log_lnum(c, lnum)) {
+ dbg_log("unmap log LEB %d", lnum);
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ goto out;
+ }
+out:
+ mutex_unlock(&c->log_mutex);
+ return err;
+}
+
+/**
+ * struct done_ref - references that have been done.
+ * @rb: rb-tree node
+ * @lnum: LEB number
+ */
+struct done_ref {
+ struct rb_node rb;
+ int lnum;
+};
+
+/**
+ * done_already - determine if a reference has been done already.
+ * @done_tree: rb-tree to store references that have been done
+ * @lnum: LEB number of reference
+ *
+ * This function returns %1 if the reference has been done, %0 if not, otherwise
+ * a negative error code is returned.
+ */
+static int done_already(struct rb_root *done_tree, int lnum)
+{
+ struct rb_node **p = &done_tree->rb_node, *parent = NULL;
+ struct done_ref *dr;
+
+ while (*p) {
+ parent = *p;
+ dr = rb_entry(parent, struct done_ref, rb);
+ if (lnum < dr->lnum)
+ p = &(*p)->rb_left;
+ else if (lnum > dr->lnum)
+ p = &(*p)->rb_right;
+ else
+ return 1;
+ }
+
+ dr = kzalloc(sizeof(struct done_ref), GFP_NOFS);
+ if (!dr)
+ return -ENOMEM;
+
+ dr->lnum = lnum;
+
+ rb_link_node(&dr->rb, parent, p);
+ rb_insert_color(&dr->rb, done_tree);
+
+ return 0;
+}
+
+/**
+ * destroy_done_tree - destroy the done tree.
+ * @done_tree: done tree to destroy
+ */
+static void destroy_done_tree(struct rb_root *done_tree)
+{
+ struct rb_node *this = done_tree->rb_node;
+ struct done_ref *dr;
+
+ while (this) {
+ if (this->rb_left) {
+ this = this->rb_left;
+ continue;
+ } else if (this->rb_right) {
+ this = this->rb_right;
+ continue;
+ }
+ dr = rb_entry(this, struct done_ref, rb);
+ this = rb_parent(this);
+ if (this) {
+ if (this->rb_left == &dr->rb)
+ this->rb_left = NULL;
+ else
+ this->rb_right = NULL;
+ }
+ kfree(dr);
+ }
+}
+
+/**
+ * add_node - add a node to the consolidated log.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to which to add
+ * @lnum: LEB number to which to write is passed and returned here
+ * @offs: offset to where to write is passed and returned here
+ * @node: node to add
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
+ void *node)
+{
+ struct ubifs_ch *ch = node;
+ int len = le32_to_cpu(ch->len), remains = c->leb_size - *offs;
+
+ if (len > remains) {
+ int sz = ALIGN(*offs, c->min_io_size), err;
+
+ ubifs_pad(c, buf + *offs, sz - *offs);
+ err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
+ if (err)
+ return err;
+ *lnum = next_log_lnum(c, *lnum);
+ *offs = 0;
+ }
+ memcpy(buf + *offs, node, len);
+ *offs += ALIGN(len, 8);
+ return 0;
+}
+
+/**
+ * ubifs_consolidate_log - consolidate the log.
+ * @c: UBIFS file-system description object
+ *
+ * Repeated failed commits could cause the log to be full, but at least 1 LEB is
+ * needed for commit. This function rewrites the reference nodes in the log
+ * omitting duplicates, and failed CS nodes, and leaving no gaps.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_consolidate_log(struct ubifs_info *c)
+{
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ struct rb_root done_tree = RB_ROOT;
+ int lnum, err, first = 1, write_lnum, offs = 0;
+ void *buf;
+
+ dbg_rcvry("log tail LEB %d, log head LEB %d", c->ltail_lnum,
+ c->lhead_lnum);
+ buf = vmalloc(c->leb_size);
+ if (!buf)
+ return -ENOMEM;
+ lnum = c->ltail_lnum;
+ write_lnum = lnum;
+ while (1) {
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ if (IS_ERR(sleb)) {
+ err = PTR_ERR(sleb);
+ goto out_free;
+ }
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ switch (snod->type) {
+ case UBIFS_REF_NODE: {
+ struct ubifs_ref_node *ref = snod->node;
+ int ref_lnum = le32_to_cpu(ref->lnum);
+
+ err = done_already(&done_tree, ref_lnum);
+ if (err < 0)
+ goto out_scan;
+ if (err != 1) {
+ err = add_node(c, buf, &write_lnum,
+ &offs, snod->node);
+ if (err)
+ goto out_scan;
+ }
+ break;
+ }
+ case UBIFS_CS_NODE:
+ if (!first)
+ break;
+ err = add_node(c, buf, &write_lnum, &offs,
+ snod->node);
+ if (err)
+ goto out_scan;
+ first = 0;
+ break;
+ }
+ }
+ ubifs_scan_destroy(sleb);
+ if (lnum == c->lhead_lnum)
+ break;
+ lnum = next_log_lnum(c, lnum);
+ }
+ if (offs) {
+ int sz = ALIGN(offs, c->min_io_size);
+
+ ubifs_pad(c, buf + offs, sz - offs);
+ err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM);
+ if (err)
+ goto out_free;
+ offs = ALIGN(offs, c->min_io_size);
+ }
+ destroy_done_tree(&done_tree);
+ vfree(buf);
+ if (write_lnum == c->lhead_lnum) {
+ ubifs_err("log is too full");
+ return -EINVAL;
+ }
+ /* Unmap remaining LEBs */
+ lnum = write_lnum;
+ do {
+ lnum = next_log_lnum(c, lnum);
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ } while (lnum != c->lhead_lnum);
+ c->lhead_lnum = write_lnum;
+ c->lhead_offs = offs;
+ dbg_rcvry("new log head at %d:%d", c->lhead_lnum, c->lhead_offs);
+ return 0;
+
+out_scan:
+ ubifs_scan_destroy(sleb);
+out_free:
+ destroy_done_tree(&done_tree);
+ vfree(buf);
+ return err;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * dbg_check_bud_bytes - make sure bud bytes calculation are all right.
+ * @c: UBIFS file-system description object
+ *
+ * This function makes sure the amount of flash space used by closed buds
+ * ('c->bud_bytes' is correct). Returns zero in case of success and %-EINVAL in
+ * case of failure.
+ */
+static int dbg_check_bud_bytes(struct ubifs_info *c)
+{
+ int i, err = 0;
+ struct ubifs_bud *bud;
+ long long bud_bytes = 0;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+ return 0;
+
+ spin_lock(&c->buds_lock);
+ for (i = 0; i < c->jhead_cnt; i++)
+ list_for_each_entry(bud, &c->jheads[i].buds_list, list)
+ bud_bytes += c->leb_size - bud->start;
+
+ if (c->bud_bytes != bud_bytes) {
+ ubifs_err("bad bud_bytes %lld, calculated %lld",
+ c->bud_bytes, bud_bytes);
+ err = -EINVAL;
+ }
+ spin_unlock(&c->buds_lock);
+
+ return err;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
new file mode 100644
index 00000000000..2ba93da71b6
--- /dev/null
+++ b/fs/ubifs/lprops.c
@@ -0,0 +1,1357 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the functions that access LEB properties and their
+ * categories. LEBs are categorized based on the needs of UBIFS, and the
+ * categories are stored as either heaps or lists to provide a fast way of
+ * finding a LEB in a particular category. For example, UBIFS may need to find
+ * an empty LEB for the journal, or a very dirty LEB for garbage collection.
+ */
+
+#include "ubifs.h"
+
+/**
+ * get_heap_comp_val - get the LEB properties value for heap comparisons.
+ * @lprops: LEB properties
+ * @cat: LEB category
+ */
+static int get_heap_comp_val(struct ubifs_lprops *lprops, int cat)
+{
+ switch (cat) {
+ case LPROPS_FREE:
+ return lprops->free;
+ case LPROPS_DIRTY_IDX:
+ return lprops->free + lprops->dirty;
+ default:
+ return lprops->dirty;
+ }
+}
+
+/**
+ * move_up_lpt_heap - move a new heap entry up as far as possible.
+ * @c: UBIFS file-system description object
+ * @heap: LEB category heap
+ * @lprops: LEB properties to move
+ * @cat: LEB category
+ *
+ * New entries to a heap are added at the bottom and then moved up until the
+ * parent's value is greater. In the case of LPT's category heaps, the value
+ * is either the amount of free space or the amount of dirty space, depending
+ * on the category.
+ */
+static void move_up_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
+ struct ubifs_lprops *lprops, int cat)
+{
+ int val1, val2, hpos;
+
+ hpos = lprops->hpos;
+ if (!hpos)
+ return; /* Already top of the heap */
+ val1 = get_heap_comp_val(lprops, cat);
+ /* Compare to parent and, if greater, move up the heap */
+ do {
+ int ppos = (hpos - 1) / 2;
+
+ val2 = get_heap_comp_val(heap->arr[ppos], cat);
+ if (val2 >= val1)
+ return;
+ /* Greater than parent so move up */
+ heap->arr[ppos]->hpos = hpos;
+ heap->arr[hpos] = heap->arr[ppos];
+ heap->arr[ppos] = lprops;
+ lprops->hpos = ppos;
+ hpos = ppos;
+ } while (hpos);
+}
+
+/**
+ * adjust_lpt_heap - move a changed heap entry up or down the heap.
+ * @c: UBIFS file-system description object
+ * @heap: LEB category heap
+ * @lprops: LEB properties to move
+ * @hpos: heap position of @lprops
+ * @cat: LEB category
+ *
+ * Changed entries in a heap are moved up or down until the parent's value is
+ * greater. In the case of LPT's category heaps, the value is either the amount
+ * of free space or the amount of dirty space, depending on the category.
+ */
+static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
+ struct ubifs_lprops *lprops, int hpos, int cat)
+{
+ int val1, val2, val3, cpos;
+
+ val1 = get_heap_comp_val(lprops, cat);
+ /* Compare to parent and, if greater than parent, move up the heap */
+ if (hpos) {
+ int ppos = (hpos - 1) / 2;
+
+ val2 = get_heap_comp_val(heap->arr[ppos], cat);
+ if (val1 > val2) {
+ /* Greater than parent so move up */
+ while (1) {
+ heap->arr[ppos]->hpos = hpos;
+ heap->arr[hpos] = heap->arr[ppos];
+ heap->arr[ppos] = lprops;
+ lprops->hpos = ppos;
+ hpos = ppos;
+ if (!hpos)
+ return;
+ ppos = (hpos - 1) / 2;
+ val2 = get_heap_comp_val(heap->arr[ppos], cat);
+ if (val1 <= val2)
+ return;
+ /* Still greater than parent so keep going */
+ }
+ }
+ }
+ /* Not greater than parent, so compare to children */
+ while (1) {
+ /* Compare to left child */
+ cpos = hpos * 2 + 1;
+ if (cpos >= heap->cnt)
+ return;
+ val2 = get_heap_comp_val(heap->arr[cpos], cat);
+ if (val1 < val2) {
+ /* Less than left child, so promote biggest child */
+ if (cpos + 1 < heap->cnt) {
+ val3 = get_heap_comp_val(heap->arr[cpos + 1],
+ cat);
+ if (val3 > val2)
+ cpos += 1; /* Right child is bigger */
+ }
+ heap->arr[cpos]->hpos = hpos;
+ heap->arr[hpos] = heap->arr[cpos];
+ heap->arr[cpos] = lprops;
+ lprops->hpos = cpos;
+ hpos = cpos;
+ continue;
+ }
+ /* Compare to right child */
+ cpos += 1;
+ if (cpos >= heap->cnt)
+ return;
+ val3 = get_heap_comp_val(heap->arr[cpos], cat);
+ if (val1 < val3) {
+ /* Less than right child, so promote right child */
+ heap->arr[cpos]->hpos = hpos;
+ heap->arr[hpos] = heap->arr[cpos];
+ heap->arr[cpos] = lprops;
+ lprops->hpos = cpos;
+ hpos = cpos;
+ continue;
+ }
+ return;
+ }
+}
+
+/**
+ * add_to_lpt_heap - add LEB properties to a LEB category heap.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to add
+ * @cat: LEB category
+ *
+ * This function returns %1 if @lprops is added to the heap for LEB category
+ * @cat, otherwise %0 is returned because the heap is full.
+ */
+static int add_to_lpt_heap(struct ubifs_info *c, struct ubifs_lprops *lprops,
+ int cat)
+{
+ struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
+
+ if (heap->cnt >= heap->max_cnt) {
+ const int b = LPT_HEAP_SZ / 2 - 1;
+ int cpos, val1, val2;
+
+ /* Compare to some other LEB on the bottom of heap */
+ /* Pick a position kind of randomly */
+ cpos = (((size_t)lprops >> 4) & b) + b;
+ ubifs_assert(cpos >= b);
+ ubifs_assert(cpos < LPT_HEAP_SZ);
+ ubifs_assert(cpos < heap->cnt);
+
+ val1 = get_heap_comp_val(lprops, cat);
+ val2 = get_heap_comp_val(heap->arr[cpos], cat);
+ if (val1 > val2) {
+ struct ubifs_lprops *lp;
+
+ lp = heap->arr[cpos];
+ lp->flags &= ~LPROPS_CAT_MASK;
+ lp->flags |= LPROPS_UNCAT;
+ list_add(&lp->list, &c->uncat_list);
+ lprops->hpos = cpos;
+ heap->arr[cpos] = lprops;
+ move_up_lpt_heap(c, heap, lprops, cat);
+ dbg_check_heap(c, heap, cat, lprops->hpos);
+ return 1; /* Added to heap */
+ }
+ dbg_check_heap(c, heap, cat, -1);
+ return 0; /* Not added to heap */
+ } else {
+ lprops->hpos = heap->cnt++;
+ heap->arr[lprops->hpos] = lprops;
+ move_up_lpt_heap(c, heap, lprops, cat);
+ dbg_check_heap(c, heap, cat, lprops->hpos);
+ return 1; /* Added to heap */
+ }
+}
+
+/**
+ * remove_from_lpt_heap - remove LEB properties from a LEB category heap.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to remove
+ * @cat: LEB category
+ */
+static void remove_from_lpt_heap(struct ubifs_info *c,
+ struct ubifs_lprops *lprops, int cat)
+{
+ struct ubifs_lpt_heap *heap;
+ int hpos = lprops->hpos;
+
+ heap = &c->lpt_heap[cat - 1];
+ ubifs_assert(hpos >= 0 && hpos < heap->cnt);
+ ubifs_assert(heap->arr[hpos] == lprops);
+ heap->cnt -= 1;
+ if (hpos < heap->cnt) {
+ heap->arr[hpos] = heap->arr[heap->cnt];
+ heap->arr[hpos]->hpos = hpos;
+ adjust_lpt_heap(c, heap, heap->arr[hpos], hpos, cat);
+ }
+ dbg_check_heap(c, heap, cat, -1);
+}
+
+/**
+ * lpt_heap_replace - replace lprops in a category heap.
+ * @c: UBIFS file-system description object
+ * @old_lprops: LEB properties to replace
+ * @new_lprops: LEB properties with which to replace
+ * @cat: LEB category
+ *
+ * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode)
+ * and the lprops that the pnode contains. When that happens, references in
+ * the category heaps to those lprops must be updated to point to the new
+ * lprops. This function does that.
+ */
+static void lpt_heap_replace(struct ubifs_info *c,
+ struct ubifs_lprops *old_lprops,
+ struct ubifs_lprops *new_lprops, int cat)
+{
+ struct ubifs_lpt_heap *heap;
+ int hpos = new_lprops->hpos;
+
+ heap = &c->lpt_heap[cat - 1];
+ heap->arr[hpos] = new_lprops;
+}
+
+/**
+ * ubifs_add_to_cat - add LEB properties to a category list or heap.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to add
+ * @cat: LEB category to which to add
+ *
+ * LEB properties are categorized to enable fast find operations.
+ */
+void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
+ int cat)
+{
+ switch (cat) {
+ case LPROPS_DIRTY:
+ case LPROPS_DIRTY_IDX:
+ case LPROPS_FREE:
+ if (add_to_lpt_heap(c, lprops, cat))
+ break;
+ /* No more room on heap so make it uncategorized */
+ cat = LPROPS_UNCAT;
+ /* Fall through */
+ case LPROPS_UNCAT:
+ list_add(&lprops->list, &c->uncat_list);
+ break;
+ case LPROPS_EMPTY:
+ list_add(&lprops->list, &c->empty_list);
+ break;
+ case LPROPS_FREEABLE:
+ list_add(&lprops->list, &c->freeable_list);
+ c->freeable_cnt += 1;
+ break;
+ case LPROPS_FRDI_IDX:
+ list_add(&lprops->list, &c->frdi_idx_list);
+ break;
+ default:
+ ubifs_assert(0);
+ }
+ lprops->flags &= ~LPROPS_CAT_MASK;
+ lprops->flags |= cat;
+}
+
+/**
+ * ubifs_remove_from_cat - remove LEB properties from a category list or heap.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to remove
+ * @cat: LEB category from which to remove
+ *
+ * LEB properties are categorized to enable fast find operations.
+ */
+static void ubifs_remove_from_cat(struct ubifs_info *c,
+ struct ubifs_lprops *lprops, int cat)
+{
+ switch (cat) {
+ case LPROPS_DIRTY:
+ case LPROPS_DIRTY_IDX:
+ case LPROPS_FREE:
+ remove_from_lpt_heap(c, lprops, cat);
+ break;
+ case LPROPS_FREEABLE:
+ c->freeable_cnt -= 1;
+ ubifs_assert(c->freeable_cnt >= 0);
+ /* Fall through */
+ case LPROPS_UNCAT:
+ case LPROPS_EMPTY:
+ case LPROPS_FRDI_IDX:
+ ubifs_assert(!list_empty(&lprops->list));
+ list_del(&lprops->list);
+ break;
+ default:
+ ubifs_assert(0);
+ }
+}
+
+/**
+ * ubifs_replace_cat - replace lprops in a category list or heap.
+ * @c: UBIFS file-system description object
+ * @old_lprops: LEB properties to replace
+ * @new_lprops: LEB properties with which to replace
+ *
+ * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode)
+ * and the lprops that the pnode contains. When that happens, references in
+ * category lists and heaps must be replaced. This function does that.
+ */
+void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
+ struct ubifs_lprops *new_lprops)
+{
+ int cat;
+
+ cat = new_lprops->flags & LPROPS_CAT_MASK;
+ switch (cat) {
+ case LPROPS_DIRTY:
+ case LPROPS_DIRTY_IDX:
+ case LPROPS_FREE:
+ lpt_heap_replace(c, old_lprops, new_lprops, cat);
+ break;
+ case LPROPS_UNCAT:
+ case LPROPS_EMPTY:
+ case LPROPS_FREEABLE:
+ case LPROPS_FRDI_IDX:
+ list_replace(&old_lprops->list, &new_lprops->list);
+ break;
+ default:
+ ubifs_assert(0);
+ }
+}
+
+/**
+ * ubifs_ensure_cat - ensure LEB properties are categorized.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties
+ *
+ * A LEB may have fallen off of the bottom of a heap, and ended up as
+ * uncategorized even though it has enough space for us now. If that is the case
+ * this function will put the LEB back onto a heap.
+ */
+void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops)
+{
+ int cat = lprops->flags & LPROPS_CAT_MASK;
+
+ if (cat != LPROPS_UNCAT)
+ return;
+ cat = ubifs_categorize_lprops(c, lprops);
+ if (cat == LPROPS_UNCAT)
+ return;
+ ubifs_remove_from_cat(c, lprops, LPROPS_UNCAT);
+ ubifs_add_to_cat(c, lprops, cat);
+}
+
+/**
+ * ubifs_categorize_lprops - categorize LEB properties.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to categorize
+ *
+ * LEB properties are categorized to enable fast find operations. This function
+ * returns the LEB category to which the LEB properties belong. Note however
+ * that if the LEB category is stored as a heap and the heap is full, the
+ * LEB properties may have their category changed to %LPROPS_UNCAT.
+ */
+int ubifs_categorize_lprops(const struct ubifs_info *c,
+ const struct ubifs_lprops *lprops)
+{
+ if (lprops->flags & LPROPS_TAKEN)
+ return LPROPS_UNCAT;
+
+ if (lprops->free == c->leb_size) {
+ ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+ return LPROPS_EMPTY;
+ }
+
+ if (lprops->free + lprops->dirty == c->leb_size) {
+ if (lprops->flags & LPROPS_INDEX)
+ return LPROPS_FRDI_IDX;
+ else
+ return LPROPS_FREEABLE;
+ }
+
+ if (lprops->flags & LPROPS_INDEX) {
+ if (lprops->dirty + lprops->free >= c->min_idx_node_sz)
+ return LPROPS_DIRTY_IDX;
+ } else {
+ if (lprops->dirty >= c->dead_wm &&
+ lprops->dirty > lprops->free)
+ return LPROPS_DIRTY;
+ if (lprops->free > 0)
+ return LPROPS_FREE;
+ }
+
+ return LPROPS_UNCAT;
+}
+
+/**
+ * change_category - change LEB properties category.
+ * @c: UBIFS file-system description object
+ * @lprops: LEB properties to recategorize
+ *
+ * LEB properties are categorized to enable fast find operations. When the LEB
+ * properties change they must be recategorized.
+ */
+static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops)
+{
+ int old_cat = lprops->flags & LPROPS_CAT_MASK;
+ int new_cat = ubifs_categorize_lprops(c, lprops);
+
+ if (old_cat == new_cat) {
+ struct ubifs_lpt_heap *heap = &c->lpt_heap[new_cat - 1];
+
+ /* lprops on a heap now must be moved up or down */
+ if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT)
+ return; /* Not on a heap */
+ heap = &c->lpt_heap[new_cat - 1];
+ adjust_lpt_heap(c, heap, lprops, lprops->hpos, new_cat);
+ } else {
+ ubifs_remove_from_cat(c, lprops, old_cat);
+ ubifs_add_to_cat(c, lprops, new_cat);
+ }
+}
+
+/**
+ * ubifs_get_lprops - get reference to LEB properties.
+ * @c: the UBIFS file-system description object
+ *
+ * This function locks lprops. Lprops have to be unlocked by
+ * 'ubifs_release_lprops()'.
+ */
+void ubifs_get_lprops(struct ubifs_info *c)
+{
+ mutex_lock(&c->lp_mutex);
+}
+
+/**
+ * calc_dark - calculate LEB dark space size.
+ * @c: the UBIFS file-system description object
+ * @spc: amount of free and dirty space in the LEB
+ *
+ * This function calculates amount of dark space in an LEB which has @spc bytes
+ * of free and dirty space. Returns the calculations result.
+ *
+ * Dark space is the space which is not always usable - it depends on which
+ * nodes are written in which order. E.g., if an LEB has only 512 free bytes,
+ * it is dark space, because it cannot fit a large data node. So UBIFS cannot
+ * count on this LEB and treat these 512 bytes as usable because it is not true
+ * if, for example, only big chunks of uncompressible data will be written to
+ * the FS.
+ */
+static int calc_dark(struct ubifs_info *c, int spc)
+{
+ ubifs_assert(!(spc & 7));
+
+ if (spc < c->dark_wm)
+ return spc;
+
+ /*
+ * If we have slightly more space then the dark space watermark, we can
+ * anyway safely assume it we'll be able to write a node of the
+ * smallest size there.
+ */
+ if (spc - c->dark_wm < MIN_WRITE_SZ)
+ return spc - MIN_WRITE_SZ;
+
+ return c->dark_wm;
+}
+
+/**
+ * is_lprops_dirty - determine if LEB properties are dirty.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to test
+ */
+static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
+{
+ struct ubifs_pnode *pnode;
+ int pos;
+
+ pos = (lprops->lnum - c->main_first) & (UBIFS_LPT_FANOUT - 1);
+ pnode = (struct ubifs_pnode *)container_of(lprops - pos,
+ struct ubifs_pnode,
+ lprops[0]);
+ return !test_bit(COW_ZNODE, &pnode->flags) &&
+ test_bit(DIRTY_CNODE, &pnode->flags);
+}
+
+/**
+ * ubifs_change_lp - change LEB properties.
+ * @c: the UBIFS file-system description object
+ * @lp: LEB properties to change
+ * @free: new free space amount
+ * @dirty: new dirty space amount
+ * @flags: new flags
+ * @idx_gc_cnt: change to the count of idx_gc list
+ *
+ * This function changes LEB properties. This function does not change a LEB
+ * property (@free, @dirty or @flag) if the value passed is %LPROPS_NC.
+ *
+ * This function returns a pointer to the updated LEB properties on success
+ * and a negative error code on failure. N.B. the LEB properties may have had to
+ * be copied (due to COW) and consequently the pointer returned may not be the
+ * same as the pointer passed.
+ */
+const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
+ const struct ubifs_lprops *lp,
+ int free, int dirty, int flags,
+ int idx_gc_cnt)
+{
+ /*
+ * This is the only function that is allowed to change lprops, so we
+ * discard the const qualifier.
+ */
+ struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp;
+
+ dbg_lp("LEB %d, free %d, dirty %d, flags %d",
+ lprops->lnum, free, dirty, flags);
+
+ ubifs_assert(mutex_is_locked(&c->lp_mutex));
+ ubifs_assert(c->lst.empty_lebs >= 0 &&
+ c->lst.empty_lebs <= c->main_lebs);
+ ubifs_assert(c->freeable_cnt >= 0);
+ ubifs_assert(c->freeable_cnt <= c->main_lebs);
+ ubifs_assert(c->lst.taken_empty_lebs >= 0);
+ ubifs_assert(c->lst.taken_empty_lebs <= c->lst.empty_lebs);
+ ubifs_assert(!(c->lst.total_free & 7) && !(c->lst.total_dirty & 7));
+ ubifs_assert(!(c->lst.total_dead & 7) && !(c->lst.total_dark & 7));
+ ubifs_assert(!(c->lst.total_used & 7));
+ ubifs_assert(free == LPROPS_NC || free >= 0);
+ ubifs_assert(dirty == LPROPS_NC || dirty >= 0);
+
+ if (!is_lprops_dirty(c, lprops)) {
+ lprops = ubifs_lpt_lookup_dirty(c, lprops->lnum);
+ if (IS_ERR(lprops))
+ return lprops;
+ } else
+ ubifs_assert(lprops == ubifs_lpt_lookup_dirty(c, lprops->lnum));
+
+ ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7));
+
+ spin_lock(&c->space_lock);
+
+ if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
+ c->lst.taken_empty_lebs -= 1;
+
+ if (!(lprops->flags & LPROPS_INDEX)) {
+ int old_spc;
+
+ old_spc = lprops->free + lprops->dirty;
+ if (old_spc < c->dead_wm)
+ c->lst.total_dead -= old_spc;
+ else
+ c->lst.total_dark -= calc_dark(c, old_spc);
+
+ c->lst.total_used -= c->leb_size - old_spc;
+ }
+
+ if (free != LPROPS_NC) {
+ free = ALIGN(free, 8);
+ c->lst.total_free += free - lprops->free;
+
+ /* Increase or decrease empty LEBs counter if needed */
+ if (free == c->leb_size) {
+ if (lprops->free != c->leb_size)
+ c->lst.empty_lebs += 1;
+ } else if (lprops->free == c->leb_size)
+ c->lst.empty_lebs -= 1;
+ lprops->free = free;
+ }
+
+ if (dirty != LPROPS_NC) {
+ dirty = ALIGN(dirty, 8);
+ c->lst.total_dirty += dirty - lprops->dirty;
+ lprops->dirty = dirty;
+ }
+
+ if (flags != LPROPS_NC) {
+ /* Take care about indexing LEBs counter if needed */
+ if ((lprops->flags & LPROPS_INDEX)) {
+ if (!(flags & LPROPS_INDEX))
+ c->lst.idx_lebs -= 1;
+ } else if (flags & LPROPS_INDEX)
+ c->lst.idx_lebs += 1;
+ lprops->flags = flags;
+ }
+
+ if (!(lprops->flags & LPROPS_INDEX)) {
+ int new_spc;
+
+ new_spc = lprops->free + lprops->dirty;
+ if (new_spc < c->dead_wm)
+ c->lst.total_dead += new_spc;
+ else
+ c->lst.total_dark += calc_dark(c, new_spc);
+
+ c->lst.total_used += c->leb_size - new_spc;
+ }
+
+ if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size)
+ c->lst.taken_empty_lebs += 1;
+
+ change_category(c, lprops);
+
+ c->idx_gc_cnt += idx_gc_cnt;
+
+ spin_unlock(&c->space_lock);
+
+ return lprops;
+}
+
+/**
+ * ubifs_release_lprops - release lprops lock.
+ * @c: the UBIFS file-system description object
+ *
+ * This function has to be called after each 'ubifs_get_lprops()' call to
+ * unlock lprops.
+ */
+void ubifs_release_lprops(struct ubifs_info *c)
+{
+ ubifs_assert(mutex_is_locked(&c->lp_mutex));
+ ubifs_assert(c->lst.empty_lebs >= 0 &&
+ c->lst.empty_lebs <= c->main_lebs);
+
+ mutex_unlock(&c->lp_mutex);
+}
+
+/**
+ * ubifs_get_lp_stats - get lprops statistics.
+ * @c: UBIFS file-system description object
+ * @st: return statistics
+ */
+void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st)
+{
+ spin_lock(&c->space_lock);
+ memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats));
+ spin_unlock(&c->space_lock);
+}
+
+/**
+ * ubifs_change_one_lp - change LEB properties.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to change properties for
+ * @free: amount of free space
+ * @dirty: amount of dirty space
+ * @flags_set: flags to set
+ * @flags_clean: flags to clean
+ * @idx_gc_cnt: change to the count of idx_gc list
+ *
+ * This function changes properties of LEB @lnum. It is a helper wrapper over
+ * 'ubifs_change_lp()' which hides lprops get/release. The arguments are the
+ * same as in case of 'ubifs_change_lp()'. Returns zero in case of success and
+ * a negative error code in case of failure.
+ */
+int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
+ int flags_set, int flags_clean, int idx_gc_cnt)
+{
+ int err = 0, flags;
+ const struct ubifs_lprops *lp;
+
+ ubifs_get_lprops(c);
+
+ lp = ubifs_lpt_lookup_dirty(c, lnum);
+ if (IS_ERR(lp)) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+
+ flags = (lp->flags | flags_set) & ~flags_clean;
+ lp = ubifs_change_lp(c, lp, free, dirty, flags, idx_gc_cnt);
+ if (IS_ERR(lp))
+ err = PTR_ERR(lp);
+
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * ubifs_update_one_lp - update LEB properties.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to change properties for
+ * @free: amount of free space
+ * @dirty: amount of dirty space to add
+ * @flags_set: flags to set
+ * @flags_clean: flags to clean
+ *
+ * This function is the same as 'ubifs_change_one_lp()' but @dirty is added to
+ * current dirty space, not substitutes it.
+ */
+int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
+ int flags_set, int flags_clean)
+{
+ int err = 0, flags;
+ const struct ubifs_lprops *lp;
+
+ ubifs_get_lprops(c);
+
+ lp = ubifs_lpt_lookup_dirty(c, lnum);
+ if (IS_ERR(lp)) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+
+ flags = (lp->flags | flags_set) & ~flags_clean;
+ lp = ubifs_change_lp(c, lp, free, lp->dirty + dirty, flags, 0);
+ if (IS_ERR(lp))
+ err = PTR_ERR(lp);
+
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * ubifs_read_one_lp - read LEB properties.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to read properties for
+ * @lp: where to store read properties
+ *
+ * This helper function reads properties of a LEB @lnum and stores them in @lp.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp)
+{
+ int err = 0;
+ const struct ubifs_lprops *lpp;
+
+ ubifs_get_lprops(c);
+
+ lpp = ubifs_lpt_lookup(c, lnum);
+ if (IS_ERR(lpp)) {
+ err = PTR_ERR(lpp);
+ goto out;
+ }
+
+ memcpy(lp, lpp, sizeof(struct ubifs_lprops));
+
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * ubifs_fast_find_free - try to find a LEB with free space quickly.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns LEB properties for a LEB with free space or %NULL if
+ * the function is unable to find a LEB quickly.
+ */
+const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c)
+{
+ struct ubifs_lprops *lprops;
+ struct ubifs_lpt_heap *heap;
+
+ ubifs_assert(mutex_is_locked(&c->lp_mutex));
+
+ heap = &c->lpt_heap[LPROPS_FREE - 1];
+ if (heap->cnt == 0)
+ return NULL;
+
+ lprops = heap->arr[0];
+ ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+ ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+ return lprops;
+}
+
+/**
+ * ubifs_fast_find_empty - try to find an empty LEB quickly.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns LEB properties for an empty LEB or %NULL if the
+ * function is unable to find an empty LEB quickly.
+ */
+const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c)
+{
+ struct ubifs_lprops *lprops;
+
+ ubifs_assert(mutex_is_locked(&c->lp_mutex));
+
+ if (list_empty(&c->empty_list))
+ return NULL;
+
+ lprops = list_entry(c->empty_list.next, struct ubifs_lprops, list);
+ ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+ ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+ ubifs_assert(lprops->free == c->leb_size);
+ return lprops;
+}
+
+/**
+ * ubifs_fast_find_freeable - try to find a freeable LEB quickly.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns LEB properties for a freeable LEB or %NULL if the
+ * function is unable to find a freeable LEB quickly.
+ */
+const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c)
+{
+ struct ubifs_lprops *lprops;
+
+ ubifs_assert(mutex_is_locked(&c->lp_mutex));
+
+ if (list_empty(&c->freeable_list))
+ return NULL;
+
+ lprops = list_entry(c->freeable_list.next, struct ubifs_lprops, list);
+ ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+ ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+ ubifs_assert(lprops->free + lprops->dirty == c->leb_size);
+ ubifs_assert(c->freeable_cnt > 0);
+ return lprops;
+}
+
+/**
+ * ubifs_fast_find_frdi_idx - try to find a freeable index LEB quickly.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns LEB properties for a freeable index LEB or %NULL if the
+ * function is unable to find a freeable index LEB quickly.
+ */
+const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c)
+{
+ struct ubifs_lprops *lprops;
+
+ ubifs_assert(mutex_is_locked(&c->lp_mutex));
+
+ if (list_empty(&c->frdi_idx_list))
+ return NULL;
+
+ lprops = list_entry(c->frdi_idx_list.next, struct ubifs_lprops, list);
+ ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+ ubifs_assert((lprops->flags & LPROPS_INDEX));
+ ubifs_assert(lprops->free + lprops->dirty == c->leb_size);
+ return lprops;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * dbg_check_cats - check category heaps and lists.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_check_cats(struct ubifs_info *c)
+{
+ struct ubifs_lprops *lprops;
+ struct list_head *pos;
+ int i, cat;
+
+ if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS)))
+ return 0;
+
+ list_for_each_entry(lprops, &c->empty_list, list) {
+ if (lprops->free != c->leb_size) {
+ ubifs_err("non-empty LEB %d on empty list "
+ "(free %d dirty %d flags %d)", lprops->lnum,
+ lprops->free, lprops->dirty, lprops->flags);
+ return -EINVAL;
+ }
+ if (lprops->flags & LPROPS_TAKEN) {
+ ubifs_err("taken LEB %d on empty list "
+ "(free %d dirty %d flags %d)", lprops->lnum,
+ lprops->free, lprops->dirty, lprops->flags);
+ return -EINVAL;
+ }
+ }
+
+ i = 0;
+ list_for_each_entry(lprops, &c->freeable_list, list) {
+ if (lprops->free + lprops->dirty != c->leb_size) {
+ ubifs_err("non-freeable LEB %d on freeable list "
+ "(free %d dirty %d flags %d)", lprops->lnum,
+ lprops->free, lprops->dirty, lprops->flags);
+ return -EINVAL;
+ }
+ if (lprops->flags & LPROPS_TAKEN) {
+ ubifs_err("taken LEB %d on freeable list "
+ "(free %d dirty %d flags %d)", lprops->lnum,
+ lprops->free, lprops->dirty, lprops->flags);
+ return -EINVAL;
+ }
+ i += 1;
+ }
+ if (i != c->freeable_cnt) {
+ ubifs_err("freeable list count %d expected %d", i,
+ c->freeable_cnt);
+ return -EINVAL;
+ }
+
+ i = 0;
+ list_for_each(pos, &c->idx_gc)
+ i += 1;
+ if (i != c->idx_gc_cnt) {
+ ubifs_err("idx_gc list count %d expected %d", i,
+ c->idx_gc_cnt);
+ return -EINVAL;
+ }
+
+ list_for_each_entry(lprops, &c->frdi_idx_list, list) {
+ if (lprops->free + lprops->dirty != c->leb_size) {
+ ubifs_err("non-freeable LEB %d on frdi_idx list "
+ "(free %d dirty %d flags %d)", lprops->lnum,
+ lprops->free, lprops->dirty, lprops->flags);
+ return -EINVAL;
+ }
+ if (lprops->flags & LPROPS_TAKEN) {
+ ubifs_err("taken LEB %d on frdi_idx list "
+ "(free %d dirty %d flags %d)", lprops->lnum,
+ lprops->free, lprops->dirty, lprops->flags);
+ return -EINVAL;
+ }
+ if (!(lprops->flags & LPROPS_INDEX)) {
+ ubifs_err("non-index LEB %d on frdi_idx list "
+ "(free %d dirty %d flags %d)", lprops->lnum,
+ lprops->free, lprops->dirty, lprops->flags);
+ return -EINVAL;
+ }
+ }
+
+ for (cat = 1; cat <= LPROPS_HEAP_CNT; cat++) {
+ struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
+
+ for (i = 0; i < heap->cnt; i++) {
+ lprops = heap->arr[i];
+ if (!lprops) {
+ ubifs_err("null ptr in LPT heap cat %d", cat);
+ return -EINVAL;
+ }
+ if (lprops->hpos != i) {
+ ubifs_err("bad ptr in LPT heap cat %d", cat);
+ return -EINVAL;
+ }
+ if (lprops->flags & LPROPS_TAKEN) {
+ ubifs_err("taken LEB in LPT heap cat %d", cat);
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
+ int add_pos)
+{
+ int i = 0, j, err = 0;
+
+ if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS)))
+ return;
+
+ for (i = 0; i < heap->cnt; i++) {
+ struct ubifs_lprops *lprops = heap->arr[i];
+ struct ubifs_lprops *lp;
+
+ if (i != add_pos)
+ if ((lprops->flags & LPROPS_CAT_MASK) != cat) {
+ err = 1;
+ goto out;
+ }
+ if (lprops->hpos != i) {
+ err = 2;
+ goto out;
+ }
+ lp = ubifs_lpt_lookup(c, lprops->lnum);
+ if (IS_ERR(lp)) {
+ err = 3;
+ goto out;
+ }
+ if (lprops != lp) {
+ dbg_msg("lprops %zx lp %zx lprops->lnum %d lp->lnum %d",
+ (size_t)lprops, (size_t)lp, lprops->lnum,
+ lp->lnum);
+ err = 4;
+ goto out;
+ }
+ for (j = 0; j < i; j++) {
+ lp = heap->arr[j];
+ if (lp == lprops) {
+ err = 5;
+ goto out;
+ }
+ if (lp->lnum == lprops->lnum) {
+ err = 6;
+ goto out;
+ }
+ }
+ }
+out:
+ if (err) {
+ dbg_msg("failed cat %d hpos %d err %d", cat, i, err);
+ dbg_dump_stack();
+ dbg_dump_heap(c, heap, cat);
+ }
+}
+
+/**
+ * struct scan_check_data - data provided to scan callback function.
+ * @lst: LEB properties statistics
+ * @err: error code
+ */
+struct scan_check_data {
+ struct ubifs_lp_stats lst;
+ int err;
+};
+
+/**
+ * scan_check_cb - scan callback.
+ * @c: the UBIFS file-system description object
+ * @lp: LEB properties to scan
+ * @in_tree: whether the LEB properties are in main memory
+ * @data: information passed to and from the caller of the scan
+ *
+ * This function returns a code that indicates whether the scan should continue
+ * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
+ * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
+ * (%LPT_SCAN_STOP).
+ */
+static int scan_check_cb(struct ubifs_info *c,
+ const struct ubifs_lprops *lp, int in_tree,
+ struct scan_check_data *data)
+{
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ struct ubifs_lp_stats *lst = &data->lst;
+ int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty;
+
+ cat = lp->flags & LPROPS_CAT_MASK;
+ if (cat != LPROPS_UNCAT) {
+ cat = ubifs_categorize_lprops(c, lp);
+ if (cat != (lp->flags & LPROPS_CAT_MASK)) {
+ ubifs_err("bad LEB category %d expected %d",
+ (lp->flags & LPROPS_CAT_MASK), cat);
+ goto out;
+ }
+ }
+
+ /* Check lp is on its category list (if it has one) */
+ if (in_tree) {
+ struct list_head *list = NULL;
+
+ switch (cat) {
+ case LPROPS_EMPTY:
+ list = &c->empty_list;
+ break;
+ case LPROPS_FREEABLE:
+ list = &c->freeable_list;
+ break;
+ case LPROPS_FRDI_IDX:
+ list = &c->frdi_idx_list;
+ break;
+ case LPROPS_UNCAT:
+ list = &c->uncat_list;
+ break;
+ }
+ if (list) {
+ struct ubifs_lprops *lprops;
+ int found = 0;
+
+ list_for_each_entry(lprops, list, list) {
+ if (lprops == lp) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ ubifs_err("bad LPT list (category %d)", cat);
+ goto out;
+ }
+ }
+ }
+
+ /* Check lp is on its category heap (if it has one) */
+ if (in_tree && cat > 0 && cat <= LPROPS_HEAP_CNT) {
+ struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1];
+
+ if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) ||
+ lp != heap->arr[lp->hpos]) {
+ ubifs_err("bad LPT heap (category %d)", cat);
+ goto out;
+ }
+ }
+
+ sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
+ if (IS_ERR(sleb)) {
+ /*
+ * After an unclean unmount, empty and freeable LEBs
+ * may contain garbage.
+ */
+ if (lp->free == c->leb_size) {
+ ubifs_err("scan errors were in empty LEB "
+ "- continuing checking");
+ lst->empty_lebs += 1;
+ lst->total_free += c->leb_size;
+ lst->total_dark += calc_dark(c, c->leb_size);
+ return LPT_SCAN_CONTINUE;
+ }
+
+ if (lp->free + lp->dirty == c->leb_size &&
+ !(lp->flags & LPROPS_INDEX)) {
+ ubifs_err("scan errors were in freeable LEB "
+ "- continuing checking");
+ lst->total_free += lp->free;
+ lst->total_dirty += lp->dirty;
+ lst->total_dark += calc_dark(c, c->leb_size);
+ return LPT_SCAN_CONTINUE;
+ }
+ data->err = PTR_ERR(sleb);
+ return LPT_SCAN_STOP;
+ }
+
+ is_idx = -1;
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ int found, level = 0;
+
+ cond_resched();
+
+ if (is_idx == -1)
+ is_idx = (snod->type == UBIFS_IDX_NODE) ? 1 : 0;
+
+ if (is_idx && snod->type != UBIFS_IDX_NODE) {
+ ubifs_err("indexing node in data LEB %d:%d",
+ lnum, snod->offs);
+ goto out_destroy;
+ }
+
+ if (snod->type == UBIFS_IDX_NODE) {
+ struct ubifs_idx_node *idx = snod->node;
+
+ key_read(c, ubifs_idx_key(c, idx), &snod->key);
+ level = le16_to_cpu(idx->level);
+ }
+
+ found = ubifs_tnc_has_node(c, &snod->key, level, lnum,
+ snod->offs, is_idx);
+ if (found) {
+ if (found < 0)
+ goto out_destroy;
+ used += ALIGN(snod->len, 8);
+ }
+ }
+
+ free = c->leb_size - sleb->endpt;
+ dirty = sleb->endpt - used;
+
+ if (free > c->leb_size || free < 0 || dirty > c->leb_size ||
+ dirty < 0) {
+ ubifs_err("bad calculated accounting for LEB %d: "
+ "free %d, dirty %d", lnum, free, dirty);
+ goto out_destroy;
+ }
+
+ if (lp->free + lp->dirty == c->leb_size &&
+ free + dirty == c->leb_size)
+ if ((is_idx && !(lp->flags & LPROPS_INDEX)) ||
+ (!is_idx && free == c->leb_size) ||
+ lp->free == c->leb_size) {
+ /*
+ * Empty or freeable LEBs could contain index
+ * nodes from an uncompleted commit due to an
+ * unclean unmount. Or they could be empty for
+ * the same reason. Or it may simply not have been
+ * unmapped.
+ */
+ free = lp->free;
+ dirty = lp->dirty;
+ is_idx = 0;
+ }
+
+ if (is_idx && lp->free + lp->dirty == free + dirty &&
+ lnum != c->ihead_lnum) {
+ /*
+ * After an unclean unmount, an index LEB could have a different
+ * amount of free space than the value recorded by lprops. That
+ * is because the in-the-gaps method may use free space or
+ * create free space (as a side-effect of using ubi_leb_change
+ * and not writing the whole LEB). The incorrect free space
+ * value is not a problem because the index is only ever
+ * allocated empty LEBs, so there will never be an attempt to
+ * write to the free space at the end of an index LEB - except
+ * by the in-the-gaps method for which it is not a problem.
+ */
+ free = lp->free;
+ dirty = lp->dirty;
+ }
+
+ if (lp->free != free || lp->dirty != dirty)
+ goto out_print;
+
+ if (is_idx && !(lp->flags & LPROPS_INDEX)) {
+ if (free == c->leb_size)
+ /* Free but not unmapped LEB, it's fine */
+ is_idx = 0;
+ else {
+ ubifs_err("indexing node without indexing "
+ "flag");
+ goto out_print;
+ }
+ }
+
+ if (!is_idx && (lp->flags & LPROPS_INDEX)) {
+ ubifs_err("data node with indexing flag");
+ goto out_print;
+ }
+
+ if (free == c->leb_size)
+ lst->empty_lebs += 1;
+
+ if (is_idx)
+ lst->idx_lebs += 1;
+
+ if (!(lp->flags & LPROPS_INDEX))
+ lst->total_used += c->leb_size - free - dirty;
+ lst->total_free += free;
+ lst->total_dirty += dirty;
+
+ if (!(lp->flags & LPROPS_INDEX)) {
+ int spc = free + dirty;
+
+ if (spc < c->dead_wm)
+ lst->total_dead += spc;
+ else
+ lst->total_dark += calc_dark(c, spc);
+ }
+
+ ubifs_scan_destroy(sleb);
+
+ return LPT_SCAN_CONTINUE;
+
+out_print:
+ ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
+ "should be free %d, dirty %d",
+ lnum, lp->free, lp->dirty, lp->flags, free, dirty);
+ dbg_dump_leb(c, lnum);
+out_destroy:
+ ubifs_scan_destroy(sleb);
+out:
+ data->err = -EINVAL;
+ return LPT_SCAN_STOP;
+}
+
+/**
+ * dbg_check_lprops - check all LEB properties.
+ * @c: UBIFS file-system description object
+ *
+ * This function checks all LEB properties and makes sure they are all correct.
+ * It returns zero if everything is fine, %-EINVAL if there is an inconsistency
+ * and other negative error codes in case of other errors. This function is
+ * called while the file system is locked (because of commit start), so no
+ * additional locking is required. Note that locking the LPT mutex would cause
+ * a circular lock dependency with the TNC mutex.
+ */
+int dbg_check_lprops(struct ubifs_info *c)
+{
+ int i, err;
+ struct scan_check_data data;
+ struct ubifs_lp_stats *lst = &data.lst;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ return 0;
+
+ /*
+ * As we are going to scan the media, the write buffers have to be
+ * synchronized.
+ */
+ for (i = 0; i < c->jhead_cnt; i++) {
+ err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+ if (err)
+ return err;
+ }
+
+ memset(lst, 0, sizeof(struct ubifs_lp_stats));
+
+ data.err = 0;
+ err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
+ (ubifs_lpt_scan_callback)scan_check_cb,
+ &data);
+ if (err && err != -ENOSPC)
+ goto out;
+ if (data.err) {
+ err = data.err;
+ goto out;
+ }
+
+ if (lst->empty_lebs != c->lst.empty_lebs ||
+ lst->idx_lebs != c->lst.idx_lebs ||
+ lst->total_free != c->lst.total_free ||
+ lst->total_dirty != c->lst.total_dirty ||
+ lst->total_used != c->lst.total_used) {
+ ubifs_err("bad overall accounting");
+ ubifs_err("calculated: empty_lebs %d, idx_lebs %d, "
+ "total_free %lld, total_dirty %lld, total_used %lld",
+ lst->empty_lebs, lst->idx_lebs, lst->total_free,
+ lst->total_dirty, lst->total_used);
+ ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, "
+ "total_free %lld, total_dirty %lld, total_used %lld",
+ c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
+ c->lst.total_dirty, c->lst.total_used);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (lst->total_dead != c->lst.total_dead ||
+ lst->total_dark != c->lst.total_dark) {
+ ubifs_err("bad dead/dark space accounting");
+ ubifs_err("calculated: total_dead %lld, total_dark %lld",
+ lst->total_dead, lst->total_dark);
+ ubifs_err("read from lprops: total_dead %lld, total_dark %lld",
+ c->lst.total_dead, c->lst.total_dark);
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = dbg_check_cats(c);
+out:
+ return err;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
new file mode 100644
index 00000000000..9ff2463177e
--- /dev/null
+++ b/fs/ubifs/lpt.c
@@ -0,0 +1,2243 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the LEB properties tree (LPT) area. The LPT area
+ * contains the LEB properties tree, a table of LPT area eraseblocks (ltab), and
+ * (for the "big" model) a table of saved LEB numbers (lsave). The LPT area sits
+ * between the log and the orphan area.
+ *
+ * The LPT area is like a miniature self-contained file system. It is required
+ * that it never runs out of space, is fast to access and update, and scales
+ * logarithmically. The LEB properties tree is implemented as a wandering tree
+ * much like the TNC, and the LPT area has its own garbage collection.
+ *
+ * The LPT has two slightly different forms called the "small model" and the
+ * "big model". The small model is used when the entire LEB properties table
+ * can be written into a single eraseblock. In that case, garbage collection
+ * consists of just writing the whole table, which therefore makes all other
+ * eraseblocks reusable. In the case of the big model, dirty eraseblocks are
+ * selected for garbage collection, which consists are marking the nodes in
+ * that LEB as dirty, and then only the dirty nodes are written out. Also, in
+ * the case of the big model, a table of LEB numbers is saved so that the entire
+ * LPT does not to be scanned looking for empty eraseblocks when UBIFS is first
+ * mounted.
+ */
+
+#include <linux/crc16.h>
+#include "ubifs.h"
+
+/**
+ * do_calc_lpt_geom - calculate sizes for the LPT area.
+ * @c: the UBIFS file-system description object
+ *
+ * Calculate the sizes of LPT bit fields, nodes, and tree, based on the
+ * properties of the flash and whether LPT is "big" (c->big_lpt).
+ */
+static void do_calc_lpt_geom(struct ubifs_info *c)
+{
+ int i, n, bits, per_leb_wastage, max_pnode_cnt;
+ long long sz, tot_wastage;
+
+ n = c->main_lebs + c->max_leb_cnt - c->leb_cnt;
+ max_pnode_cnt = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT);
+
+ c->lpt_hght = 1;
+ n = UBIFS_LPT_FANOUT;
+ while (n < max_pnode_cnt) {
+ c->lpt_hght += 1;
+ n <<= UBIFS_LPT_FANOUT_SHIFT;
+ }
+
+ c->pnode_cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
+
+ n = DIV_ROUND_UP(c->pnode_cnt, UBIFS_LPT_FANOUT);
+ c->nnode_cnt = n;
+ for (i = 1; i < c->lpt_hght; i++) {
+ n = DIV_ROUND_UP(n, UBIFS_LPT_FANOUT);
+ c->nnode_cnt += n;
+ }
+
+ c->space_bits = fls(c->leb_size) - 3;
+ c->lpt_lnum_bits = fls(c->lpt_lebs);
+ c->lpt_offs_bits = fls(c->leb_size - 1);
+ c->lpt_spc_bits = fls(c->leb_size);
+
+ n = DIV_ROUND_UP(c->max_leb_cnt, UBIFS_LPT_FANOUT);
+ c->pcnt_bits = fls(n - 1);
+
+ c->lnum_bits = fls(c->max_leb_cnt - 1);
+
+ bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
+ (c->big_lpt ? c->pcnt_bits : 0) +
+ (c->space_bits * 2 + 1) * UBIFS_LPT_FANOUT;
+ c->pnode_sz = (bits + 7) / 8;
+
+ bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
+ (c->big_lpt ? c->pcnt_bits : 0) +
+ (c->lpt_lnum_bits + c->lpt_offs_bits) * UBIFS_LPT_FANOUT;
+ c->nnode_sz = (bits + 7) / 8;
+
+ bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
+ c->lpt_lebs * c->lpt_spc_bits * 2;
+ c->ltab_sz = (bits + 7) / 8;
+
+ bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS +
+ c->lnum_bits * c->lsave_cnt;
+ c->lsave_sz = (bits + 7) / 8;
+
+ /* Calculate the minimum LPT size */
+ c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
+ c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz;
+ c->lpt_sz += c->ltab_sz;
+ c->lpt_sz += c->lsave_sz;
+
+ /* Add wastage */
+ sz = c->lpt_sz;
+ per_leb_wastage = max_t(int, c->pnode_sz, c->nnode_sz);
+ sz += per_leb_wastage;
+ tot_wastage = per_leb_wastage;
+ while (sz > c->leb_size) {
+ sz += per_leb_wastage;
+ sz -= c->leb_size;
+ tot_wastage += per_leb_wastage;
+ }
+ tot_wastage += ALIGN(sz, c->min_io_size) - sz;
+ c->lpt_sz += tot_wastage;
+}
+
+/**
+ * ubifs_calc_lpt_geom - calculate and check sizes for the LPT area.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_calc_lpt_geom(struct ubifs_info *c)
+{
+ int lebs_needed;
+ uint64_t sz;
+
+ do_calc_lpt_geom(c);
+
+ /* Verify that lpt_lebs is big enough */
+ sz = c->lpt_sz * 2; /* Must have at least 2 times the size */
+ sz += c->leb_size - 1;
+ do_div(sz, c->leb_size);
+ lebs_needed = sz;
+ if (lebs_needed > c->lpt_lebs) {
+ ubifs_err("too few LPT LEBs");
+ return -EINVAL;
+ }
+
+ /* Verify that ltab fits in a single LEB (since ltab is a single node */
+ if (c->ltab_sz > c->leb_size) {
+ ubifs_err("LPT ltab too big");
+ return -EINVAL;
+ }
+
+ c->check_lpt_free = c->big_lpt;
+
+ return 0;
+}
+
+/**
+ * calc_dflt_lpt_geom - calculate default LPT geometry.
+ * @c: the UBIFS file-system description object
+ * @main_lebs: number of main area LEBs is passed and returned here
+ * @big_lpt: whether the LPT area is "big" is returned here
+ *
+ * The size of the LPT area depends on parameters that themselves are dependent
+ * on the size of the LPT area. This function, successively recalculates the LPT
+ * area geometry until the parameters and resultant geometry are consistent.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
+ int *big_lpt)
+{
+ int i, lebs_needed;
+ uint64_t sz;
+
+ /* Start by assuming the minimum number of LPT LEBs */
+ c->lpt_lebs = UBIFS_MIN_LPT_LEBS;
+ c->main_lebs = *main_lebs - c->lpt_lebs;
+ if (c->main_lebs <= 0)
+ return -EINVAL;
+
+ /* And assume we will use the small LPT model */
+ c->big_lpt = 0;
+
+ /*
+ * Calculate the geometry based on assumptions above and then see if it
+ * makes sense
+ */
+ do_calc_lpt_geom(c);
+
+ /* Small LPT model must have lpt_sz < leb_size */
+ if (c->lpt_sz > c->leb_size) {
+ /* Nope, so try again using big LPT model */
+ c->big_lpt = 1;
+ do_calc_lpt_geom(c);
+ }
+
+ /* Now check there are enough LPT LEBs */
+ for (i = 0; i < 64 ; i++) {
+ sz = c->lpt_sz * 4; /* Allow 4 times the size */
+ sz += c->leb_size - 1;
+ do_div(sz, c->leb_size);
+ lebs_needed = sz;
+ if (lebs_needed > c->lpt_lebs) {
+ /* Not enough LPT LEBs so try again with more */
+ c->lpt_lebs = lebs_needed;
+ c->main_lebs = *main_lebs - c->lpt_lebs;
+ if (c->main_lebs <= 0)
+ return -EINVAL;
+ do_calc_lpt_geom(c);
+ continue;
+ }
+ if (c->ltab_sz > c->leb_size) {
+ ubifs_err("LPT ltab too big");
+ return -EINVAL;
+ }
+ *main_lebs = c->main_lebs;
+ *big_lpt = c->big_lpt;
+ return 0;
+ }
+ return -EINVAL;
+}
+
+/**
+ * pack_bits - pack bit fields end-to-end.
+ * @addr: address at which to pack (passed and next address returned)
+ * @pos: bit position at which to pack (passed and next position returned)
+ * @val: value to pack
+ * @nrbits: number of bits of value to pack (1-32)
+ */
+static void pack_bits(uint8_t **addr, int *pos, uint32_t val, int nrbits)
+{
+ uint8_t *p = *addr;
+ int b = *pos;
+
+ ubifs_assert(nrbits > 0);
+ ubifs_assert(nrbits <= 32);
+ ubifs_assert(*pos >= 0);
+ ubifs_assert(*pos < 8);
+ ubifs_assert((val >> nrbits) == 0 || nrbits == 32);
+ if (b) {
+ *p |= ((uint8_t)val) << b;
+ nrbits += b;
+ if (nrbits > 8) {
+ *++p = (uint8_t)(val >>= (8 - b));
+ if (nrbits > 16) {
+ *++p = (uint8_t)(val >>= 8);
+ if (nrbits > 24) {
+ *++p = (uint8_t)(val >>= 8);
+ if (nrbits > 32)
+ *++p = (uint8_t)(val >>= 8);
+ }
+ }
+ }
+ } else {
+ *p = (uint8_t)val;
+ if (nrbits > 8) {
+ *++p = (uint8_t)(val >>= 8);
+ if (nrbits > 16) {
+ *++p = (uint8_t)(val >>= 8);
+ if (nrbits > 24)
+ *++p = (uint8_t)(val >>= 8);
+ }
+ }
+ }
+ b = nrbits & 7;
+ if (b == 0)
+ p++;
+ *addr = p;
+ *pos = b;
+}
+
+/**
+ * ubifs_unpack_bits - unpack bit fields.
+ * @addr: address at which to unpack (passed and next address returned)
+ * @pos: bit position at which to unpack (passed and next position returned)
+ * @nrbits: number of bits of value to unpack (1-32)
+ *
+ * This functions returns the value unpacked.
+ */
+uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits)
+{
+ const int k = 32 - nrbits;
+ uint8_t *p = *addr;
+ int b = *pos;
+ uint32_t val;
+
+ ubifs_assert(nrbits > 0);
+ ubifs_assert(nrbits <= 32);
+ ubifs_assert(*pos >= 0);
+ ubifs_assert(*pos < 8);
+ if (b) {
+ val = p[1] | ((uint32_t)p[2] << 8) | ((uint32_t)p[3] << 16) |
+ ((uint32_t)p[4] << 24);
+ val <<= (8 - b);
+ val |= *p >> b;
+ nrbits += b;
+ } else
+ val = p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) |
+ ((uint32_t)p[3] << 24);
+ val <<= k;
+ val >>= k;
+ b = nrbits & 7;
+ p += nrbits / 8;
+ *addr = p;
+ *pos = b;
+ ubifs_assert((val >> nrbits) == 0 || nrbits - b == 32);
+ return val;
+}
+
+/**
+ * ubifs_pack_pnode - pack all the bit fields of a pnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @pnode: pnode to pack
+ */
+void ubifs_pack_pnode(struct ubifs_info *c, void *buf,
+ struct ubifs_pnode *pnode)
+{
+ uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+ int i, pos = 0;
+ uint16_t crc;
+
+ pack_bits(&addr, &pos, UBIFS_LPT_PNODE, UBIFS_LPT_TYPE_BITS);
+ if (c->big_lpt)
+ pack_bits(&addr, &pos, pnode->num, c->pcnt_bits);
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ pack_bits(&addr, &pos, pnode->lprops[i].free >> 3,
+ c->space_bits);
+ pack_bits(&addr, &pos, pnode->lprops[i].dirty >> 3,
+ c->space_bits);
+ if (pnode->lprops[i].flags & LPROPS_INDEX)
+ pack_bits(&addr, &pos, 1, 1);
+ else
+ pack_bits(&addr, &pos, 0, 1);
+ }
+ crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+ c->pnode_sz - UBIFS_LPT_CRC_BYTES);
+ addr = buf;
+ pos = 0;
+ pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_pack_nnode - pack all the bit fields of a nnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @nnode: nnode to pack
+ */
+void ubifs_pack_nnode(struct ubifs_info *c, void *buf,
+ struct ubifs_nnode *nnode)
+{
+ uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+ int i, pos = 0;
+ uint16_t crc;
+
+ pack_bits(&addr, &pos, UBIFS_LPT_NNODE, UBIFS_LPT_TYPE_BITS);
+ if (c->big_lpt)
+ pack_bits(&addr, &pos, nnode->num, c->pcnt_bits);
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ int lnum = nnode->nbranch[i].lnum;
+
+ if (lnum == 0)
+ lnum = c->lpt_last + 1;
+ pack_bits(&addr, &pos, lnum - c->lpt_first, c->lpt_lnum_bits);
+ pack_bits(&addr, &pos, nnode->nbranch[i].offs,
+ c->lpt_offs_bits);
+ }
+ crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+ c->nnode_sz - UBIFS_LPT_CRC_BYTES);
+ addr = buf;
+ pos = 0;
+ pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_pack_ltab - pack the LPT's own lprops table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @ltab: LPT's own lprops table to pack
+ */
+void ubifs_pack_ltab(struct ubifs_info *c, void *buf,
+ struct ubifs_lpt_lprops *ltab)
+{
+ uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+ int i, pos = 0;
+ uint16_t crc;
+
+ pack_bits(&addr, &pos, UBIFS_LPT_LTAB, UBIFS_LPT_TYPE_BITS);
+ for (i = 0; i < c->lpt_lebs; i++) {
+ pack_bits(&addr, &pos, ltab[i].free, c->lpt_spc_bits);
+ pack_bits(&addr, &pos, ltab[i].dirty, c->lpt_spc_bits);
+ }
+ crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+ c->ltab_sz - UBIFS_LPT_CRC_BYTES);
+ addr = buf;
+ pos = 0;
+ pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_pack_lsave - pack the LPT's save table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @lsave: LPT's save table to pack
+ */
+void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave)
+{
+ uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+ int i, pos = 0;
+ uint16_t crc;
+
+ pack_bits(&addr, &pos, UBIFS_LPT_LSAVE, UBIFS_LPT_TYPE_BITS);
+ for (i = 0; i < c->lsave_cnt; i++)
+ pack_bits(&addr, &pos, lsave[i], c->lnum_bits);
+ crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+ c->lsave_sz - UBIFS_LPT_CRC_BYTES);
+ addr = buf;
+ pos = 0;
+ pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_add_lpt_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to which to add dirty space
+ * @dirty: amount of dirty space to add
+ */
+void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty)
+{
+ if (!dirty || !lnum)
+ return;
+ dbg_lp("LEB %d add %d to %d",
+ lnum, dirty, c->ltab[lnum - c->lpt_first].dirty);
+ ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
+ c->ltab[lnum - c->lpt_first].dirty += dirty;
+}
+
+/**
+ * set_ltab - set LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @free: amount of free space
+ * @dirty: amount of dirty space
+ */
+static void set_ltab(struct ubifs_info *c, int lnum, int free, int dirty)
+{
+ dbg_lp("LEB %d free %d dirty %d to %d %d",
+ lnum, c->ltab[lnum - c->lpt_first].free,
+ c->ltab[lnum - c->lpt_first].dirty, free, dirty);
+ ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
+ c->ltab[lnum - c->lpt_first].free = free;
+ c->ltab[lnum - c->lpt_first].dirty = dirty;
+}
+
+/**
+ * ubifs_add_nnode_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode for which to add dirt
+ */
+void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode)
+{
+ struct ubifs_nnode *np = nnode->parent;
+
+ if (np)
+ ubifs_add_lpt_dirt(c, np->nbranch[nnode->iip].lnum,
+ c->nnode_sz);
+ else {
+ ubifs_add_lpt_dirt(c, c->lpt_lnum, c->nnode_sz);
+ if (!(c->lpt_drty_flgs & LTAB_DIRTY)) {
+ c->lpt_drty_flgs |= LTAB_DIRTY;
+ ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz);
+ }
+ }
+}
+
+/**
+ * add_pnode_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode for which to add dirt
+ */
+static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+ ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum,
+ c->pnode_sz);
+}
+
+/**
+ * calc_nnode_num - calculate nnode number.
+ * @row: the row in the tree (root is zero)
+ * @col: the column in the row (leftmost is zero)
+ *
+ * The nnode number is a number that uniquely identifies a nnode and can be used
+ * easily to traverse the tree from the root to that nnode.
+ *
+ * This function calculates and returns the nnode number for the nnode at @row
+ * and @col.
+ */
+static int calc_nnode_num(int row, int col)
+{
+ int num, bits;
+
+ num = 1;
+ while (row--) {
+ bits = (col & (UBIFS_LPT_FANOUT - 1));
+ col >>= UBIFS_LPT_FANOUT_SHIFT;
+ num <<= UBIFS_LPT_FANOUT_SHIFT;
+ num |= bits;
+ }
+ return num;
+}
+
+/**
+ * calc_nnode_num_from_parent - calculate nnode number.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * The nnode number is a number that uniquely identifies a nnode and can be used
+ * easily to traverse the tree from the root to that nnode.
+ *
+ * This function calculates and returns the nnode number based on the parent's
+ * nnode number and the index in parent.
+ */
+static int calc_nnode_num_from_parent(struct ubifs_info *c,
+ struct ubifs_nnode *parent, int iip)
+{
+ int num, shft;
+
+ if (!parent)
+ return 1;
+ shft = (c->lpt_hght - parent->level) * UBIFS_LPT_FANOUT_SHIFT;
+ num = parent->num ^ (1 << shft);
+ num |= (UBIFS_LPT_FANOUT + iip) << shft;
+ return num;
+}
+
+/**
+ * calc_pnode_num_from_parent - calculate pnode number.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * The pnode number is a number that uniquely identifies a pnode and can be used
+ * easily to traverse the tree from the root to that pnode.
+ *
+ * This function calculates and returns the pnode number based on the parent's
+ * nnode number and the index in parent.
+ */
+static int calc_pnode_num_from_parent(struct ubifs_info *c,
+ struct ubifs_nnode *parent, int iip)
+{
+ int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0;
+
+ for (i = 0; i < n; i++) {
+ num <<= UBIFS_LPT_FANOUT_SHIFT;
+ num |= pnum & (UBIFS_LPT_FANOUT - 1);
+ pnum >>= UBIFS_LPT_FANOUT_SHIFT;
+ }
+ num <<= UBIFS_LPT_FANOUT_SHIFT;
+ num |= iip;
+ return num;
+}
+
+/**
+ * ubifs_create_dflt_lpt - create default LPT.
+ * @c: UBIFS file-system description object
+ * @main_lebs: number of main area LEBs is passed and returned here
+ * @lpt_first: LEB number of first LPT LEB
+ * @lpt_lebs: number of LEBs for LPT is passed and returned here
+ * @big_lpt: use big LPT model is passed and returned here
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
+ int *lpt_lebs, int *big_lpt)
+{
+ int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row;
+ int blnum, boffs, bsz, bcnt;
+ struct ubifs_pnode *pnode = NULL;
+ struct ubifs_nnode *nnode = NULL;
+ void *buf = NULL, *p;
+ struct ubifs_lpt_lprops *ltab = NULL;
+ int *lsave = NULL;
+
+ err = calc_dflt_lpt_geom(c, main_lebs, big_lpt);
+ if (err)
+ return err;
+ *lpt_lebs = c->lpt_lebs;
+
+ /* Needed by 'ubifs_pack_nnode()' and 'set_ltab()' */
+ c->lpt_first = lpt_first;
+ /* Needed by 'set_ltab()' */
+ c->lpt_last = lpt_first + c->lpt_lebs - 1;
+ /* Needed by 'ubifs_pack_lsave()' */
+ c->main_first = c->leb_cnt - *main_lebs;
+
+ lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_KERNEL);
+ pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL);
+ nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL);
+ buf = vmalloc(c->leb_size);
+ ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+ if (!pnode || !nnode || !buf || !ltab || !lsave) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ ubifs_assert(!c->ltab);
+ c->ltab = ltab; /* Needed by set_ltab */
+
+ /* Initialize LPT's own lprops */
+ for (i = 0; i < c->lpt_lebs; i++) {
+ ltab[i].free = c->leb_size;
+ ltab[i].dirty = 0;
+ ltab[i].tgc = 0;
+ ltab[i].cmt = 0;
+ }
+
+ lnum = lpt_first;
+ p = buf;
+ /* Number of leaf nodes (pnodes) */
+ cnt = c->pnode_cnt;
+
+ /*
+ * The first pnode contains the LEB properties for the LEBs that contain
+ * the root inode node and the root index node of the index tree.
+ */
+ node_sz = ALIGN(ubifs_idx_node_sz(c, 1), 8);
+ iopos = ALIGN(node_sz, c->min_io_size);
+ pnode->lprops[0].free = c->leb_size - iopos;
+ pnode->lprops[0].dirty = iopos - node_sz;
+ pnode->lprops[0].flags = LPROPS_INDEX;
+
+ node_sz = UBIFS_INO_NODE_SZ;
+ iopos = ALIGN(node_sz, c->min_io_size);
+ pnode->lprops[1].free = c->leb_size - iopos;
+ pnode->lprops[1].dirty = iopos - node_sz;
+
+ for (i = 2; i < UBIFS_LPT_FANOUT; i++)
+ pnode->lprops[i].free = c->leb_size;
+
+ /* Add first pnode */
+ ubifs_pack_pnode(c, p, pnode);
+ p += c->pnode_sz;
+ len = c->pnode_sz;
+ pnode->num += 1;
+
+ /* Reset pnode values for remaining pnodes */
+ pnode->lprops[0].free = c->leb_size;
+ pnode->lprops[0].dirty = 0;
+ pnode->lprops[0].flags = 0;
+
+ pnode->lprops[1].free = c->leb_size;
+ pnode->lprops[1].dirty = 0;
+
+ /*
+ * To calculate the internal node branches, we keep information about
+ * the level below.
+ */
+ blnum = lnum; /* LEB number of level below */
+ boffs = 0; /* Offset of level below */
+ bcnt = cnt; /* Number of nodes in level below */
+ bsz = c->pnode_sz; /* Size of nodes in level below */
+
+ /* Add all remaining pnodes */
+ for (i = 1; i < cnt; i++) {
+ if (len + c->pnode_sz > c->leb_size) {
+ alen = ALIGN(len, c->min_io_size);
+ set_ltab(c, lnum, c->leb_size - alen, alen - len);
+ memset(p, 0xff, alen - len);
+ err = ubi_leb_change(c->ubi, lnum++, buf, alen,
+ UBI_SHORTTERM);
+ if (err)
+ goto out;
+ p = buf;
+ len = 0;
+ }
+ ubifs_pack_pnode(c, p, pnode);
+ p += c->pnode_sz;
+ len += c->pnode_sz;
+ /*
+ * pnodes are simply numbered left to right starting at zero,
+ * which means the pnode number can be used easily to traverse
+ * down the tree to the corresponding pnode.
+ */
+ pnode->num += 1;
+ }
+
+ row = 0;
+ for (i = UBIFS_LPT_FANOUT; cnt > i; i <<= UBIFS_LPT_FANOUT_SHIFT)
+ row += 1;
+ /* Add all nnodes, one level at a time */
+ while (1) {
+ /* Number of internal nodes (nnodes) at next level */
+ cnt = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT);
+ for (i = 0; i < cnt; i++) {
+ if (len + c->nnode_sz > c->leb_size) {
+ alen = ALIGN(len, c->min_io_size);
+ set_ltab(c, lnum, c->leb_size - alen,
+ alen - len);
+ memset(p, 0xff, alen - len);
+ err = ubi_leb_change(c->ubi, lnum++, buf, alen,
+ UBI_SHORTTERM);
+ if (err)
+ goto out;
+ p = buf;
+ len = 0;
+ }
+ /* Only 1 nnode at this level, so it is the root */
+ if (cnt == 1) {
+ c->lpt_lnum = lnum;
+ c->lpt_offs = len;
+ }
+ /* Set branches to the level below */
+ for (j = 0; j < UBIFS_LPT_FANOUT; j++) {
+ if (bcnt) {
+ if (boffs + bsz > c->leb_size) {
+ blnum += 1;
+ boffs = 0;
+ }
+ nnode->nbranch[j].lnum = blnum;
+ nnode->nbranch[j].offs = boffs;
+ boffs += bsz;
+ bcnt--;
+ } else {
+ nnode->nbranch[j].lnum = 0;
+ nnode->nbranch[j].offs = 0;
+ }
+ }
+ nnode->num = calc_nnode_num(row, i);
+ ubifs_pack_nnode(c, p, nnode);
+ p += c->nnode_sz;
+ len += c->nnode_sz;
+ }
+ /* Only 1 nnode at this level, so it is the root */
+ if (cnt == 1)
+ break;
+ /* Update the information about the level below */
+ bcnt = cnt;
+ bsz = c->nnode_sz;
+ row -= 1;
+ }
+
+ if (*big_lpt) {
+ /* Need to add LPT's save table */
+ if (len + c->lsave_sz > c->leb_size) {
+ alen = ALIGN(len, c->min_io_size);
+ set_ltab(c, lnum, c->leb_size - alen, alen - len);
+ memset(p, 0xff, alen - len);
+ err = ubi_leb_change(c->ubi, lnum++, buf, alen,
+ UBI_SHORTTERM);
+ if (err)
+ goto out;
+ p = buf;
+ len = 0;
+ }
+
+ c->lsave_lnum = lnum;
+ c->lsave_offs = len;
+
+ for (i = 0; i < c->lsave_cnt && i < *main_lebs; i++)
+ lsave[i] = c->main_first + i;
+ for (; i < c->lsave_cnt; i++)
+ lsave[i] = c->main_first;
+
+ ubifs_pack_lsave(c, p, lsave);
+ p += c->lsave_sz;
+ len += c->lsave_sz;
+ }
+
+ /* Need to add LPT's own LEB properties table */
+ if (len + c->ltab_sz > c->leb_size) {
+ alen = ALIGN(len, c->min_io_size);
+ set_ltab(c, lnum, c->leb_size - alen, alen - len);
+ memset(p, 0xff, alen - len);
+ err = ubi_leb_change(c->ubi, lnum++, buf, alen, UBI_SHORTTERM);
+ if (err)
+ goto out;
+ p = buf;
+ len = 0;
+ }
+
+ c->ltab_lnum = lnum;
+ c->ltab_offs = len;
+
+ /* Update ltab before packing it */
+ len += c->ltab_sz;
+ alen = ALIGN(len, c->min_io_size);
+ set_ltab(c, lnum, c->leb_size - alen, alen - len);
+
+ ubifs_pack_ltab(c, p, ltab);
+ p += c->ltab_sz;
+
+ /* Write remaining buffer */
+ memset(p, 0xff, alen - len);
+ err = ubi_leb_change(c->ubi, lnum, buf, alen, UBI_SHORTTERM);
+ if (err)
+ goto out;
+
+ c->nhead_lnum = lnum;
+ c->nhead_offs = ALIGN(len, c->min_io_size);
+
+ dbg_lp("space_bits %d", c->space_bits);
+ dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
+ dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
+ dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits);
+ dbg_lp("pcnt_bits %d", c->pcnt_bits);
+ dbg_lp("lnum_bits %d", c->lnum_bits);
+ dbg_lp("pnode_sz %d", c->pnode_sz);
+ dbg_lp("nnode_sz %d", c->nnode_sz);
+ dbg_lp("ltab_sz %d", c->ltab_sz);
+ dbg_lp("lsave_sz %d", c->lsave_sz);
+ dbg_lp("lsave_cnt %d", c->lsave_cnt);
+ dbg_lp("lpt_hght %d", c->lpt_hght);
+ dbg_lp("big_lpt %d", c->big_lpt);
+ dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
+ dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
+ dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
+ if (c->big_lpt)
+ dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+out:
+ c->ltab = NULL;
+ kfree(lsave);
+ vfree(ltab);
+ vfree(buf);
+ kfree(nnode);
+ kfree(pnode);
+ return err;
+}
+
+/**
+ * update_cats - add LEB properties of a pnode to LEB category lists and heaps.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode
+ *
+ * When a pnode is loaded into memory, the LEB properties it contains are added,
+ * by this function, to the LEB category lists and heaps.
+ */
+static void update_cats(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+ int i;
+
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ int cat = pnode->lprops[i].flags & LPROPS_CAT_MASK;
+ int lnum = pnode->lprops[i].lnum;
+
+ if (!lnum)
+ return;
+ ubifs_add_to_cat(c, &pnode->lprops[i], cat);
+ }
+}
+
+/**
+ * replace_cats - add LEB properties of a pnode to LEB category lists and heaps.
+ * @c: UBIFS file-system description object
+ * @old_pnode: pnode copied
+ * @new_pnode: pnode copy
+ *
+ * During commit it is sometimes necessary to copy a pnode
+ * (see dirty_cow_pnode). When that happens, references in
+ * category lists and heaps must be replaced. This function does that.
+ */
+static void replace_cats(struct ubifs_info *c, struct ubifs_pnode *old_pnode,
+ struct ubifs_pnode *new_pnode)
+{
+ int i;
+
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ if (!new_pnode->lprops[i].lnum)
+ return;
+ ubifs_replace_cat(c, &old_pnode->lprops[i],
+ &new_pnode->lprops[i]);
+ }
+}
+
+/**
+ * check_lpt_crc - check LPT node crc is correct.
+ * @c: UBIFS file-system description object
+ * @buf: buffer containing node
+ * @len: length of node
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int check_lpt_crc(void *buf, int len)
+{
+ int pos = 0;
+ uint8_t *addr = buf;
+ uint16_t crc, calc_crc;
+
+ crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS);
+ calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+ len - UBIFS_LPT_CRC_BYTES);
+ if (crc != calc_crc) {
+ ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc,
+ calc_crc);
+ dbg_dump_stack();
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * check_lpt_type - check LPT node type is correct.
+ * @c: UBIFS file-system description object
+ * @addr: address of type bit field is passed and returned updated here
+ * @pos: position of type bit field is passed and returned updated here
+ * @type: expected type
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int check_lpt_type(uint8_t **addr, int *pos, int type)
+{
+ int node_type;
+
+ node_type = ubifs_unpack_bits(addr, pos, UBIFS_LPT_TYPE_BITS);
+ if (node_type != type) {
+ ubifs_err("invalid type (%d) in LPT node type %d", node_type,
+ type);
+ dbg_dump_stack();
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * unpack_pnode - unpack a pnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer containing packed pnode to unpack
+ * @pnode: pnode structure to fill
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int unpack_pnode(struct ubifs_info *c, void *buf,
+ struct ubifs_pnode *pnode)
+{
+ uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+ int i, pos = 0, err;
+
+ err = check_lpt_type(&addr, &pos, UBIFS_LPT_PNODE);
+ if (err)
+ return err;
+ if (c->big_lpt)
+ pnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits);
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ struct ubifs_lprops * const lprops = &pnode->lprops[i];
+
+ lprops->free = ubifs_unpack_bits(&addr, &pos, c->space_bits);
+ lprops->free <<= 3;
+ lprops->dirty = ubifs_unpack_bits(&addr, &pos, c->space_bits);
+ lprops->dirty <<= 3;
+
+ if (ubifs_unpack_bits(&addr, &pos, 1))
+ lprops->flags = LPROPS_INDEX;
+ else
+ lprops->flags = 0;
+ lprops->flags |= ubifs_categorize_lprops(c, lprops);
+ }
+ err = check_lpt_crc(buf, c->pnode_sz);
+ return err;
+}
+
+/**
+ * unpack_nnode - unpack a nnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer containing packed nnode to unpack
+ * @nnode: nnode structure to fill
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int unpack_nnode(struct ubifs_info *c, void *buf,
+ struct ubifs_nnode *nnode)
+{
+ uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+ int i, pos = 0, err;
+
+ err = check_lpt_type(&addr, &pos, UBIFS_LPT_NNODE);
+ if (err)
+ return err;
+ if (c->big_lpt)
+ nnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits);
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ int lnum;
+
+ lnum = ubifs_unpack_bits(&addr, &pos, c->lpt_lnum_bits) +
+ c->lpt_first;
+ if (lnum == c->lpt_last + 1)
+ lnum = 0;
+ nnode->nbranch[i].lnum = lnum;
+ nnode->nbranch[i].offs = ubifs_unpack_bits(&addr, &pos,
+ c->lpt_offs_bits);
+ }
+ err = check_lpt_crc(buf, c->nnode_sz);
+ return err;
+}
+
+/**
+ * unpack_ltab - unpack the LPT's own lprops table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer from which to unpack
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int unpack_ltab(struct ubifs_info *c, void *buf)
+{
+ uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+ int i, pos = 0, err;
+
+ err = check_lpt_type(&addr, &pos, UBIFS_LPT_LTAB);
+ if (err)
+ return err;
+ for (i = 0; i < c->lpt_lebs; i++) {
+ int free = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits);
+ int dirty = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits);
+
+ if (free < 0 || free > c->leb_size || dirty < 0 ||
+ dirty > c->leb_size || free + dirty > c->leb_size)
+ return -EINVAL;
+
+ c->ltab[i].free = free;
+ c->ltab[i].dirty = dirty;
+ c->ltab[i].tgc = 0;
+ c->ltab[i].cmt = 0;
+ }
+ err = check_lpt_crc(buf, c->ltab_sz);
+ return err;
+}
+
+/**
+ * unpack_lsave - unpack the LPT's save table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer from which to unpack
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int unpack_lsave(struct ubifs_info *c, void *buf)
+{
+ uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+ int i, pos = 0, err;
+
+ err = check_lpt_type(&addr, &pos, UBIFS_LPT_LSAVE);
+ if (err)
+ return err;
+ for (i = 0; i < c->lsave_cnt; i++) {
+ int lnum = ubifs_unpack_bits(&addr, &pos, c->lnum_bits);
+
+ if (lnum < c->main_first || lnum >= c->leb_cnt)
+ return -EINVAL;
+ c->lsave[i] = lnum;
+ }
+ err = check_lpt_crc(buf, c->lsave_sz);
+ return err;
+}
+
+/**
+ * validate_nnode - validate a nnode.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode to validate
+ * @parent: parent nnode (or NULL for the root nnode)
+ * @iip: index in parent
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode,
+ struct ubifs_nnode *parent, int iip)
+{
+ int i, lvl, max_offs;
+
+ if (c->big_lpt) {
+ int num = calc_nnode_num_from_parent(c, parent, iip);
+
+ if (nnode->num != num)
+ return -EINVAL;
+ }
+ lvl = parent ? parent->level - 1 : c->lpt_hght;
+ if (lvl < 1)
+ return -EINVAL;
+ if (lvl == 1)
+ max_offs = c->leb_size - c->pnode_sz;
+ else
+ max_offs = c->leb_size - c->nnode_sz;
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ int lnum = nnode->nbranch[i].lnum;
+ int offs = nnode->nbranch[i].offs;
+
+ if (lnum == 0) {
+ if (offs != 0)
+ return -EINVAL;
+ continue;
+ }
+ if (lnum < c->lpt_first || lnum > c->lpt_last)
+ return -EINVAL;
+ if (offs < 0 || offs > max_offs)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * validate_pnode - validate a pnode.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to validate
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+ struct ubifs_nnode *parent, int iip)
+{
+ int i;
+
+ if (c->big_lpt) {
+ int num = calc_pnode_num_from_parent(c, parent, iip);
+
+ if (pnode->num != num)
+ return -EINVAL;
+ }
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ int free = pnode->lprops[i].free;
+ int dirty = pnode->lprops[i].dirty;
+
+ if (free < 0 || free > c->leb_size || free % c->min_io_size ||
+ (free & 7))
+ return -EINVAL;
+ if (dirty < 0 || dirty > c->leb_size || (dirty & 7))
+ return -EINVAL;
+ if (dirty + free > c->leb_size)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * set_pnode_lnum - set LEB numbers on a pnode.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to update
+ *
+ * This function calculates the LEB numbers for the LEB properties it contains
+ * based on the pnode number.
+ */
+static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+ int i, lnum;
+
+ lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + c->main_first;
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ if (lnum >= c->leb_cnt)
+ return;
+ pnode->lprops[i].lnum = lnum++;
+ }
+}
+
+/**
+ * ubifs_read_nnode - read a nnode from flash and link it to the tree in memory.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode (or NULL for the root)
+ * @iip: index in parent
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
+{
+ struct ubifs_nbranch *branch = NULL;
+ struct ubifs_nnode *nnode = NULL;
+ void *buf = c->lpt_nod_buf;
+ int err, lnum, offs;
+
+ if (parent) {
+ branch = &parent->nbranch[iip];
+ lnum = branch->lnum;
+ offs = branch->offs;
+ } else {
+ lnum = c->lpt_lnum;
+ offs = c->lpt_offs;
+ }
+ nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
+ if (!nnode) {
+ err = -ENOMEM;
+ goto out;
+ }
+ if (lnum == 0) {
+ /*
+ * This nnode was not written which just means that the LEB
+ * properties in the subtree below it describe empty LEBs. We
+ * make the nnode as though we had read it, which in fact means
+ * doing almost nothing.
+ */
+ if (c->big_lpt)
+ nnode->num = calc_nnode_num_from_parent(c, parent, iip);
+ } else {
+ err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz);
+ if (err)
+ goto out;
+ err = unpack_nnode(c, buf, nnode);
+ if (err)
+ goto out;
+ }
+ err = validate_nnode(c, nnode, parent, iip);
+ if (err)
+ goto out;
+ if (!c->big_lpt)
+ nnode->num = calc_nnode_num_from_parent(c, parent, iip);
+ if (parent) {
+ branch->nnode = nnode;
+ nnode->level = parent->level - 1;
+ } else {
+ c->nroot = nnode;
+ nnode->level = c->lpt_hght;
+ }
+ nnode->parent = parent;
+ nnode->iip = iip;
+ return 0;
+
+out:
+ ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs);
+ kfree(nnode);
+ return err;
+}
+
+/**
+ * read_pnode - read a pnode from flash and link it to the tree in memory.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
+{
+ struct ubifs_nbranch *branch;
+ struct ubifs_pnode *pnode = NULL;
+ void *buf = c->lpt_nod_buf;
+ int err, lnum, offs;
+
+ branch = &parent->nbranch[iip];
+ lnum = branch->lnum;
+ offs = branch->offs;
+ pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
+ if (!pnode) {
+ err = -ENOMEM;
+ goto out;
+ }
+ if (lnum == 0) {
+ /*
+ * This pnode was not written which just means that the LEB
+ * properties in it describe empty LEBs. We make the pnode as
+ * though we had read it.
+ */
+ int i;
+
+ if (c->big_lpt)
+ pnode->num = calc_pnode_num_from_parent(c, parent, iip);
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ struct ubifs_lprops * const lprops = &pnode->lprops[i];
+
+ lprops->free = c->leb_size;
+ lprops->flags = ubifs_categorize_lprops(c, lprops);
+ }
+ } else {
+ err = ubi_read(c->ubi, lnum, buf, offs, c->pnode_sz);
+ if (err)
+ goto out;
+ err = unpack_pnode(c, buf, pnode);
+ if (err)
+ goto out;
+ }
+ err = validate_pnode(c, pnode, parent, iip);
+ if (err)
+ goto out;
+ if (!c->big_lpt)
+ pnode->num = calc_pnode_num_from_parent(c, parent, iip);
+ branch->pnode = pnode;
+ pnode->parent = parent;
+ pnode->iip = iip;
+ set_pnode_lnum(c, pnode);
+ c->pnodes_have += 1;
+ return 0;
+
+out:
+ ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs);
+ dbg_dump_pnode(c, pnode, parent, iip);
+ dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip));
+ kfree(pnode);
+ return err;
+}
+
+/**
+ * read_ltab - read LPT's own lprops table.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int read_ltab(struct ubifs_info *c)
+{
+ int err;
+ void *buf;
+
+ buf = vmalloc(c->ltab_sz);
+ if (!buf)
+ return -ENOMEM;
+ err = ubi_read(c->ubi, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz);
+ if (err)
+ goto out;
+ err = unpack_ltab(c, buf);
+out:
+ vfree(buf);
+ return err;
+}
+
+/**
+ * read_lsave - read LPT's save table.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int read_lsave(struct ubifs_info *c)
+{
+ int err, i;
+ void *buf;
+
+ buf = vmalloc(c->lsave_sz);
+ if (!buf)
+ return -ENOMEM;
+ err = ubi_read(c->ubi, c->lsave_lnum, buf, c->lsave_offs, c->lsave_sz);
+ if (err)
+ goto out;
+ err = unpack_lsave(c, buf);
+ if (err)
+ goto out;
+ for (i = 0; i < c->lsave_cnt; i++) {
+ int lnum = c->lsave[i];
+
+ /*
+ * Due to automatic resizing, the values in the lsave table
+ * could be beyond the volume size - just ignore them.
+ */
+ if (lnum >= c->leb_cnt)
+ continue;
+ ubifs_lpt_lookup(c, lnum);
+ }
+out:
+ vfree(buf);
+ return err;
+}
+
+/**
+ * ubifs_get_nnode - get a nnode.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode (or NULL for the root)
+ * @iip: index in parent
+ *
+ * This function returns a pointer to the nnode on success or a negative error
+ * code on failure.
+ */
+struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c,
+ struct ubifs_nnode *parent, int iip)
+{
+ struct ubifs_nbranch *branch;
+ struct ubifs_nnode *nnode;
+ int err;
+
+ branch = &parent->nbranch[iip];
+ nnode = branch->nnode;
+ if (nnode)
+ return nnode;
+ err = ubifs_read_nnode(c, parent, iip);
+ if (err)
+ return ERR_PTR(err);
+ return branch->nnode;
+}
+
+/**
+ * ubifs_get_pnode - get a pnode.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * This function returns a pointer to the pnode on success or a negative error
+ * code on failure.
+ */
+struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c,
+ struct ubifs_nnode *parent, int iip)
+{
+ struct ubifs_nbranch *branch;
+ struct ubifs_pnode *pnode;
+ int err;
+
+ branch = &parent->nbranch[iip];
+ pnode = branch->pnode;
+ if (pnode)
+ return pnode;
+ err = read_pnode(c, parent, iip);
+ if (err)
+ return ERR_PTR(err);
+ update_cats(c, branch->pnode);
+ return branch->pnode;
+}
+
+/**
+ * ubifs_lpt_lookup - lookup LEB properties in the LPT.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to lookup
+ *
+ * This function returns a pointer to the LEB properties on success or a
+ * negative error code on failure.
+ */
+struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
+{
+ int err, i, h, iip, shft;
+ struct ubifs_nnode *nnode;
+ struct ubifs_pnode *pnode;
+
+ if (!c->nroot) {
+ err = ubifs_read_nnode(c, NULL, 0);
+ if (err)
+ return ERR_PTR(err);
+ }
+ nnode = c->nroot;
+ i = lnum - c->main_first;
+ shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+ for (h = 1; h < c->lpt_hght; h++) {
+ iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+ shft -= UBIFS_LPT_FANOUT_SHIFT;
+ nnode = ubifs_get_nnode(c, nnode, iip);
+ if (IS_ERR(nnode))
+ return ERR_PTR(PTR_ERR(nnode));
+ }
+ iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+ shft -= UBIFS_LPT_FANOUT_SHIFT;
+ pnode = ubifs_get_pnode(c, nnode, iip);
+ if (IS_ERR(pnode))
+ return ERR_PTR(PTR_ERR(pnode));
+ iip = (i & (UBIFS_LPT_FANOUT - 1));
+ dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
+ pnode->lprops[iip].free, pnode->lprops[iip].dirty,
+ pnode->lprops[iip].flags);
+ return &pnode->lprops[iip];
+}
+
+/**
+ * dirty_cow_nnode - ensure a nnode is not being committed.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode to check
+ *
+ * Returns dirtied nnode on success or negative error code on failure.
+ */
+static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c,
+ struct ubifs_nnode *nnode)
+{
+ struct ubifs_nnode *n;
+ int i;
+
+ if (!test_bit(COW_CNODE, &nnode->flags)) {
+ /* nnode is not being committed */
+ if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
+ c->dirty_nn_cnt += 1;
+ ubifs_add_nnode_dirt(c, nnode);
+ }
+ return nnode;
+ }
+
+ /* nnode is being committed, so copy it */
+ n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
+ if (unlikely(!n))
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(n, nnode, sizeof(struct ubifs_nnode));
+ n->cnext = NULL;
+ __set_bit(DIRTY_CNODE, &n->flags);
+ __clear_bit(COW_CNODE, &n->flags);
+
+ /* The children now have new parent */
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ struct ubifs_nbranch *branch = &n->nbranch[i];
+
+ if (branch->cnode)
+ branch->cnode->parent = n;
+ }
+
+ ubifs_assert(!test_bit(OBSOLETE_CNODE, &nnode->flags));
+ __set_bit(OBSOLETE_CNODE, &nnode->flags);
+
+ c->dirty_nn_cnt += 1;
+ ubifs_add_nnode_dirt(c, nnode);
+ if (nnode->parent)
+ nnode->parent->nbranch[n->iip].nnode = n;
+ else
+ c->nroot = n;
+ return n;
+}
+
+/**
+ * dirty_cow_pnode - ensure a pnode is not being committed.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to check
+ *
+ * Returns dirtied pnode on success or negative error code on failure.
+ */
+static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c,
+ struct ubifs_pnode *pnode)
+{
+ struct ubifs_pnode *p;
+
+ if (!test_bit(COW_CNODE, &pnode->flags)) {
+ /* pnode is not being committed */
+ if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) {
+ c->dirty_pn_cnt += 1;
+ add_pnode_dirt(c, pnode);
+ }
+ return pnode;
+ }
+
+ /* pnode is being committed, so copy it */
+ p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
+ if (unlikely(!p))
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(p, pnode, sizeof(struct ubifs_pnode));
+ p->cnext = NULL;
+ __set_bit(DIRTY_CNODE, &p->flags);
+ __clear_bit(COW_CNODE, &p->flags);
+ replace_cats(c, pnode, p);
+
+ ubifs_assert(!test_bit(OBSOLETE_CNODE, &pnode->flags));
+ __set_bit(OBSOLETE_CNODE, &pnode->flags);
+
+ c->dirty_pn_cnt += 1;
+ add_pnode_dirt(c, pnode);
+ pnode->parent->nbranch[p->iip].pnode = p;
+ return p;
+}
+
+/**
+ * ubifs_lpt_lookup_dirty - lookup LEB properties in the LPT.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to lookup
+ *
+ * This function returns a pointer to the LEB properties on success or a
+ * negative error code on failure.
+ */
+struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
+{
+ int err, i, h, iip, shft;
+ struct ubifs_nnode *nnode;
+ struct ubifs_pnode *pnode;
+
+ if (!c->nroot) {
+ err = ubifs_read_nnode(c, NULL, 0);
+ if (err)
+ return ERR_PTR(err);
+ }
+ nnode = c->nroot;
+ nnode = dirty_cow_nnode(c, nnode);
+ if (IS_ERR(nnode))
+ return ERR_PTR(PTR_ERR(nnode));
+ i = lnum - c->main_first;
+ shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+ for (h = 1; h < c->lpt_hght; h++) {
+ iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+ shft -= UBIFS_LPT_FANOUT_SHIFT;
+ nnode = ubifs_get_nnode(c, nnode, iip);
+ if (IS_ERR(nnode))
+ return ERR_PTR(PTR_ERR(nnode));
+ nnode = dirty_cow_nnode(c, nnode);
+ if (IS_ERR(nnode))
+ return ERR_PTR(PTR_ERR(nnode));
+ }
+ iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+ shft -= UBIFS_LPT_FANOUT_SHIFT;
+ pnode = ubifs_get_pnode(c, nnode, iip);
+ if (IS_ERR(pnode))
+ return ERR_PTR(PTR_ERR(pnode));
+ pnode = dirty_cow_pnode(c, pnode);
+ if (IS_ERR(pnode))
+ return ERR_PTR(PTR_ERR(pnode));
+ iip = (i & (UBIFS_LPT_FANOUT - 1));
+ dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
+ pnode->lprops[iip].free, pnode->lprops[iip].dirty,
+ pnode->lprops[iip].flags);
+ ubifs_assert(test_bit(DIRTY_CNODE, &pnode->flags));
+ return &pnode->lprops[iip];
+}
+
+/**
+ * lpt_init_rd - initialize the LPT for reading.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int lpt_init_rd(struct ubifs_info *c)
+{
+ int err, i;
+
+ c->ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+ if (!c->ltab)
+ return -ENOMEM;
+
+ i = max_t(int, c->nnode_sz, c->pnode_sz);
+ c->lpt_nod_buf = kmalloc(i, GFP_KERNEL);
+ if (!c->lpt_nod_buf)
+ return -ENOMEM;
+
+ for (i = 0; i < LPROPS_HEAP_CNT; i++) {
+ c->lpt_heap[i].arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ,
+ GFP_KERNEL);
+ if (!c->lpt_heap[i].arr)
+ return -ENOMEM;
+ c->lpt_heap[i].cnt = 0;
+ c->lpt_heap[i].max_cnt = LPT_HEAP_SZ;
+ }
+
+ c->dirty_idx.arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, GFP_KERNEL);
+ if (!c->dirty_idx.arr)
+ return -ENOMEM;
+ c->dirty_idx.cnt = 0;
+ c->dirty_idx.max_cnt = LPT_HEAP_SZ;
+
+ err = read_ltab(c);
+ if (err)
+ return err;
+
+ dbg_lp("space_bits %d", c->space_bits);
+ dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
+ dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
+ dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits);
+ dbg_lp("pcnt_bits %d", c->pcnt_bits);
+ dbg_lp("lnum_bits %d", c->lnum_bits);
+ dbg_lp("pnode_sz %d", c->pnode_sz);
+ dbg_lp("nnode_sz %d", c->nnode_sz);
+ dbg_lp("ltab_sz %d", c->ltab_sz);
+ dbg_lp("lsave_sz %d", c->lsave_sz);
+ dbg_lp("lsave_cnt %d", c->lsave_cnt);
+ dbg_lp("lpt_hght %d", c->lpt_hght);
+ dbg_lp("big_lpt %d", c->big_lpt);
+ dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
+ dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
+ dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
+ if (c->big_lpt)
+ dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+
+ return 0;
+}
+
+/**
+ * lpt_init_wr - initialize the LPT for writing.
+ * @c: UBIFS file-system description object
+ *
+ * 'lpt_init_rd()' must have been called already.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int lpt_init_wr(struct ubifs_info *c)
+{
+ int err, i;
+
+ c->ltab_cmt = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+ if (!c->ltab_cmt)
+ return -ENOMEM;
+
+ c->lpt_buf = vmalloc(c->leb_size);
+ if (!c->lpt_buf)
+ return -ENOMEM;
+
+ if (c->big_lpt) {
+ c->lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_NOFS);
+ if (!c->lsave)
+ return -ENOMEM;
+ err = read_lsave(c);
+ if (err)
+ return err;
+ }
+
+ for (i = 0; i < c->lpt_lebs; i++)
+ if (c->ltab[i].free == c->leb_size) {
+ err = ubifs_leb_unmap(c, i + c->lpt_first);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/**
+ * ubifs_lpt_init - initialize the LPT.
+ * @c: UBIFS file-system description object
+ * @rd: whether to initialize lpt for reading
+ * @wr: whether to initialize lpt for writing
+ *
+ * For mounting 'rw', @rd and @wr are both true. For mounting 'ro', @rd is true
+ * and @wr is false. For mounting from 'ro' to 'rw', @rd is false and @wr is
+ * true.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr)
+{
+ int err;
+
+ if (rd) {
+ err = lpt_init_rd(c);
+ if (err)
+ return err;
+ }
+
+ if (wr) {
+ err = lpt_init_wr(c);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/**
+ * struct lpt_scan_node - somewhere to put nodes while we scan LPT.
+ * @nnode: where to keep a nnode
+ * @pnode: where to keep a pnode
+ * @cnode: where to keep a cnode
+ * @in_tree: is the node in the tree in memory
+ * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in
+ * the tree
+ * @ptr.pnode: ditto for pnode
+ * @ptr.cnode: ditto for cnode
+ */
+struct lpt_scan_node {
+ union {
+ struct ubifs_nnode nnode;
+ struct ubifs_pnode pnode;
+ struct ubifs_cnode cnode;
+ };
+ int in_tree;
+ union {
+ struct ubifs_nnode *nnode;
+ struct ubifs_pnode *pnode;
+ struct ubifs_cnode *cnode;
+ } ptr;
+};
+
+/**
+ * scan_get_nnode - for the scan, get a nnode from either the tree or flash.
+ * @c: the UBIFS file-system description object
+ * @path: where to put the nnode
+ * @parent: parent of the nnode
+ * @iip: index in parent of the nnode
+ *
+ * This function returns a pointer to the nnode on success or a negative error
+ * code on failure.
+ */
+static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
+ struct lpt_scan_node *path,
+ struct ubifs_nnode *parent, int iip)
+{
+ struct ubifs_nbranch *branch;
+ struct ubifs_nnode *nnode;
+ void *buf = c->lpt_nod_buf;
+ int err;
+
+ branch = &parent->nbranch[iip];
+ nnode = branch->nnode;
+ if (nnode) {
+ path->in_tree = 1;
+ path->ptr.nnode = nnode;
+ return nnode;
+ }
+ nnode = &path->nnode;
+ path->in_tree = 0;
+ path->ptr.nnode = nnode;
+ memset(nnode, 0, sizeof(struct ubifs_nnode));
+ if (branch->lnum == 0) {
+ /*
+ * This nnode was not written which just means that the LEB
+ * properties in the subtree below it describe empty LEBs. We
+ * make the nnode as though we had read it, which in fact means
+ * doing almost nothing.
+ */
+ if (c->big_lpt)
+ nnode->num = calc_nnode_num_from_parent(c, parent, iip);
+ } else {
+ err = ubi_read(c->ubi, branch->lnum, buf, branch->offs,
+ c->nnode_sz);
+ if (err)
+ return ERR_PTR(err);
+ err = unpack_nnode(c, buf, nnode);
+ if (err)
+ return ERR_PTR(err);
+ }
+ err = validate_nnode(c, nnode, parent, iip);
+ if (err)
+ return ERR_PTR(err);
+ if (!c->big_lpt)
+ nnode->num = calc_nnode_num_from_parent(c, parent, iip);
+ nnode->level = parent->level - 1;
+ nnode->parent = parent;
+ nnode->iip = iip;
+ return nnode;
+}
+
+/**
+ * scan_get_pnode - for the scan, get a pnode from either the tree or flash.
+ * @c: the UBIFS file-system description object
+ * @path: where to put the pnode
+ * @parent: parent of the pnode
+ * @iip: index in parent of the pnode
+ *
+ * This function returns a pointer to the pnode on success or a negative error
+ * code on failure.
+ */
+static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c,
+ struct lpt_scan_node *path,
+ struct ubifs_nnode *parent, int iip)
+{
+ struct ubifs_nbranch *branch;
+ struct ubifs_pnode *pnode;
+ void *buf = c->lpt_nod_buf;
+ int err;
+
+ branch = &parent->nbranch[iip];
+ pnode = branch->pnode;
+ if (pnode) {
+ path->in_tree = 1;
+ path->ptr.pnode = pnode;
+ return pnode;
+ }
+ pnode = &path->pnode;
+ path->in_tree = 0;
+ path->ptr.pnode = pnode;
+ memset(pnode, 0, sizeof(struct ubifs_pnode));
+ if (branch->lnum == 0) {
+ /*
+ * This pnode was not written which just means that the LEB
+ * properties in it describe empty LEBs. We make the pnode as
+ * though we had read it.
+ */
+ int i;
+
+ if (c->big_lpt)
+ pnode->num = calc_pnode_num_from_parent(c, parent, iip);
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ struct ubifs_lprops * const lprops = &pnode->lprops[i];
+
+ lprops->free = c->leb_size;
+ lprops->flags = ubifs_categorize_lprops(c, lprops);
+ }
+ } else {
+ ubifs_assert(branch->lnum >= c->lpt_first &&
+ branch->lnum <= c->lpt_last);
+ ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size);
+ err = ubi_read(c->ubi, branch->lnum, buf, branch->offs,
+ c->pnode_sz);
+ if (err)
+ return ERR_PTR(err);
+ err = unpack_pnode(c, buf, pnode);
+ if (err)
+ return ERR_PTR(err);
+ }
+ err = validate_pnode(c, pnode, parent, iip);
+ if (err)
+ return ERR_PTR(err);
+ if (!c->big_lpt)
+ pnode->num = calc_pnode_num_from_parent(c, parent, iip);
+ pnode->parent = parent;
+ pnode->iip = iip;
+ set_pnode_lnum(c, pnode);
+ return pnode;
+}
+
+/**
+ * ubifs_lpt_scan_nolock - scan the LPT.
+ * @c: the UBIFS file-system description object
+ * @start_lnum: LEB number from which to start scanning
+ * @end_lnum: LEB number at which to stop scanning
+ * @scan_cb: callback function called for each lprops
+ * @data: data to be passed to the callback function
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum,
+ ubifs_lpt_scan_callback scan_cb, void *data)
+{
+ int err = 0, i, h, iip, shft;
+ struct ubifs_nnode *nnode;
+ struct ubifs_pnode *pnode;
+ struct lpt_scan_node *path;
+
+ if (start_lnum == -1) {
+ start_lnum = end_lnum + 1;
+ if (start_lnum >= c->leb_cnt)
+ start_lnum = c->main_first;
+ }
+
+ ubifs_assert(start_lnum >= c->main_first && start_lnum < c->leb_cnt);
+ ubifs_assert(end_lnum >= c->main_first && end_lnum < c->leb_cnt);
+
+ if (!c->nroot) {
+ err = ubifs_read_nnode(c, NULL, 0);
+ if (err)
+ return err;
+ }
+
+ path = kmalloc(sizeof(struct lpt_scan_node) * (c->lpt_hght + 1),
+ GFP_NOFS);
+ if (!path)
+ return -ENOMEM;
+
+ path[0].ptr.nnode = c->nroot;
+ path[0].in_tree = 1;
+again:
+ /* Descend to the pnode containing start_lnum */
+ nnode = c->nroot;
+ i = start_lnum - c->main_first;
+ shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+ for (h = 1; h < c->lpt_hght; h++) {
+ iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+ shft -= UBIFS_LPT_FANOUT_SHIFT;
+ nnode = scan_get_nnode(c, path + h, nnode, iip);
+ if (IS_ERR(nnode)) {
+ err = PTR_ERR(nnode);
+ goto out;
+ }
+ }
+ iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+ shft -= UBIFS_LPT_FANOUT_SHIFT;
+ pnode = scan_get_pnode(c, path + h, nnode, iip);
+ if (IS_ERR(pnode)) {
+ err = PTR_ERR(pnode);
+ goto out;
+ }
+ iip = (i & (UBIFS_LPT_FANOUT - 1));
+
+ /* Loop for each lprops */
+ while (1) {
+ struct ubifs_lprops *lprops = &pnode->lprops[iip];
+ int ret, lnum = lprops->lnum;
+
+ ret = scan_cb(c, lprops, path[h].in_tree, data);
+ if (ret < 0) {
+ err = ret;
+ goto out;
+ }
+ if (ret & LPT_SCAN_ADD) {
+ /* Add all the nodes in path to the tree in memory */
+ for (h = 1; h < c->lpt_hght; h++) {
+ const size_t sz = sizeof(struct ubifs_nnode);
+ struct ubifs_nnode *parent;
+
+ if (path[h].in_tree)
+ continue;
+ nnode = kmalloc(sz, GFP_NOFS);
+ if (!nnode) {
+ err = -ENOMEM;
+ goto out;
+ }
+ memcpy(nnode, &path[h].nnode, sz);
+ parent = nnode->parent;
+ parent->nbranch[nnode->iip].nnode = nnode;
+ path[h].ptr.nnode = nnode;
+ path[h].in_tree = 1;
+ path[h + 1].cnode.parent = nnode;
+ }
+ if (path[h].in_tree)
+ ubifs_ensure_cat(c, lprops);
+ else {
+ const size_t sz = sizeof(struct ubifs_pnode);
+ struct ubifs_nnode *parent;
+
+ pnode = kmalloc(sz, GFP_NOFS);
+ if (!pnode) {
+ err = -ENOMEM;
+ goto out;
+ }
+ memcpy(pnode, &path[h].pnode, sz);
+ parent = pnode->parent;
+ parent->nbranch[pnode->iip].pnode = pnode;
+ path[h].ptr.pnode = pnode;
+ path[h].in_tree = 1;
+ update_cats(c, pnode);
+ c->pnodes_have += 1;
+ }
+ err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)
+ c->nroot, 0, 0);
+ if (err)
+ goto out;
+ err = dbg_check_cats(c);
+ if (err)
+ goto out;
+ }
+ if (ret & LPT_SCAN_STOP) {
+ err = 0;
+ break;
+ }
+ /* Get the next lprops */
+ if (lnum == end_lnum) {
+ /*
+ * We got to the end without finding what we were
+ * looking for
+ */
+ err = -ENOSPC;
+ goto out;
+ }
+ if (lnum + 1 >= c->leb_cnt) {
+ /* Wrap-around to the beginning */
+ start_lnum = c->main_first;
+ goto again;
+ }
+ if (iip + 1 < UBIFS_LPT_FANOUT) {
+ /* Next lprops is in the same pnode */
+ iip += 1;
+ continue;
+ }
+ /* We need to get the next pnode. Go up until we can go right */
+ iip = pnode->iip;
+ while (1) {
+ h -= 1;
+ ubifs_assert(h >= 0);
+ nnode = path[h].ptr.nnode;
+ if (iip + 1 < UBIFS_LPT_FANOUT)
+ break;
+ iip = nnode->iip;
+ }
+ /* Go right */
+ iip += 1;
+ /* Descend to the pnode */
+ h += 1;
+ for (; h < c->lpt_hght; h++) {
+ nnode = scan_get_nnode(c, path + h, nnode, iip);
+ if (IS_ERR(nnode)) {
+ err = PTR_ERR(nnode);
+ goto out;
+ }
+ iip = 0;
+ }
+ pnode = scan_get_pnode(c, path + h, nnode, iip);
+ if (IS_ERR(pnode)) {
+ err = PTR_ERR(pnode);
+ goto out;
+ }
+ iip = 0;
+ }
+out:
+ kfree(path);
+ return err;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * dbg_chk_pnode - check a pnode.
+ * @c: the UBIFS file-system description object
+ * @pnode: pnode to check
+ * @col: pnode column
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+ int col)
+{
+ int i;
+
+ if (pnode->num != col) {
+ dbg_err("pnode num %d expected %d parent num %d iip %d",
+ pnode->num, col, pnode->parent->num, pnode->iip);
+ return -EINVAL;
+ }
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ struct ubifs_lprops *lp, *lprops = &pnode->lprops[i];
+ int lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + i +
+ c->main_first;
+ int found, cat = lprops->flags & LPROPS_CAT_MASK;
+ struct ubifs_lpt_heap *heap;
+ struct list_head *list = NULL;
+
+ if (lnum >= c->leb_cnt)
+ continue;
+ if (lprops->lnum != lnum) {
+ dbg_err("bad LEB number %d expected %d",
+ lprops->lnum, lnum);
+ return -EINVAL;
+ }
+ if (lprops->flags & LPROPS_TAKEN) {
+ if (cat != LPROPS_UNCAT) {
+ dbg_err("LEB %d taken but not uncat %d",
+ lprops->lnum, cat);
+ return -EINVAL;
+ }
+ continue;
+ }
+ if (lprops->flags & LPROPS_INDEX) {
+ switch (cat) {
+ case LPROPS_UNCAT:
+ case LPROPS_DIRTY_IDX:
+ case LPROPS_FRDI_IDX:
+ break;
+ default:
+ dbg_err("LEB %d index but cat %d",
+ lprops->lnum, cat);
+ return -EINVAL;
+ }
+ } else {
+ switch (cat) {
+ case LPROPS_UNCAT:
+ case LPROPS_DIRTY:
+ case LPROPS_FREE:
+ case LPROPS_EMPTY:
+ case LPROPS_FREEABLE:
+ break;
+ default:
+ dbg_err("LEB %d not index but cat %d",
+ lprops->lnum, cat);
+ return -EINVAL;
+ }
+ }
+ switch (cat) {
+ case LPROPS_UNCAT:
+ list = &c->uncat_list;
+ break;
+ case LPROPS_EMPTY:
+ list = &c->empty_list;
+ break;
+ case LPROPS_FREEABLE:
+ list = &c->freeable_list;
+ break;
+ case LPROPS_FRDI_IDX:
+ list = &c->frdi_idx_list;
+ break;
+ }
+ found = 0;
+ switch (cat) {
+ case LPROPS_DIRTY:
+ case LPROPS_DIRTY_IDX:
+ case LPROPS_FREE:
+ heap = &c->lpt_heap[cat - 1];
+ if (lprops->hpos < heap->cnt &&
+ heap->arr[lprops->hpos] == lprops)
+ found = 1;
+ break;
+ case LPROPS_UNCAT:
+ case LPROPS_EMPTY:
+ case LPROPS_FREEABLE:
+ case LPROPS_FRDI_IDX:
+ list_for_each_entry(lp, list, list)
+ if (lprops == lp) {
+ found = 1;
+ break;
+ }
+ break;
+ }
+ if (!found) {
+ dbg_err("LEB %d cat %d not found in cat heap/list",
+ lprops->lnum, cat);
+ return -EINVAL;
+ }
+ switch (cat) {
+ case LPROPS_EMPTY:
+ if (lprops->free != c->leb_size) {
+ dbg_err("LEB %d cat %d free %d dirty %d",
+ lprops->lnum, cat, lprops->free,
+ lprops->dirty);
+ return -EINVAL;
+ }
+ case LPROPS_FREEABLE:
+ case LPROPS_FRDI_IDX:
+ if (lprops->free + lprops->dirty != c->leb_size) {
+ dbg_err("LEB %d cat %d free %d dirty %d",
+ lprops->lnum, cat, lprops->free,
+ lprops->dirty);
+ return -EINVAL;
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * dbg_check_lpt_nodes - check nnodes and pnodes.
+ * @c: the UBIFS file-system description object
+ * @cnode: next cnode (nnode or pnode) to check
+ * @row: row of cnode (root is zero)
+ * @col: column of cnode (leftmost is zero)
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
+ int row, int col)
+{
+ struct ubifs_nnode *nnode, *nn;
+ struct ubifs_cnode *cn;
+ int num, iip = 0, err;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ return 0;
+
+ while (cnode) {
+ ubifs_assert(row >= 0);
+ nnode = cnode->parent;
+ if (cnode->level) {
+ /* cnode is a nnode */
+ num = calc_nnode_num(row, col);
+ if (cnode->num != num) {
+ dbg_err("nnode num %d expected %d "
+ "parent num %d iip %d", cnode->num, num,
+ (nnode ? nnode->num : 0), cnode->iip);
+ return -EINVAL;
+ }
+ nn = (struct ubifs_nnode *)cnode;
+ while (iip < UBIFS_LPT_FANOUT) {
+ cn = nn->nbranch[iip].cnode;
+ if (cn) {
+ /* Go down */
+ row += 1;
+ col <<= UBIFS_LPT_FANOUT_SHIFT;
+ col += iip;
+ iip = 0;
+ cnode = cn;
+ break;
+ }
+ /* Go right */
+ iip += 1;
+ }
+ if (iip < UBIFS_LPT_FANOUT)
+ continue;
+ } else {
+ struct ubifs_pnode *pnode;
+
+ /* cnode is a pnode */
+ pnode = (struct ubifs_pnode *)cnode;
+ err = dbg_chk_pnode(c, pnode, col);
+ if (err)
+ return err;
+ }
+ /* Go up and to the right */
+ row -= 1;
+ col >>= UBIFS_LPT_FANOUT_SHIFT;
+ iip = cnode->iip + 1;
+ cnode = (struct ubifs_cnode *)nnode;
+ }
+ return 0;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
new file mode 100644
index 00000000000..5f0b83e20af
--- /dev/null
+++ b/fs/ubifs/lpt_commit.c
@@ -0,0 +1,1648 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements commit-related functionality of the LEB properties
+ * subsystem.
+ */
+
+#include <linux/crc16.h>
+#include "ubifs.h"
+
+/**
+ * first_dirty_cnode - find first dirty cnode.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode at which to start
+ *
+ * This function returns the first dirty cnode or %NULL if there is not one.
+ */
+static struct ubifs_cnode *first_dirty_cnode(struct ubifs_nnode *nnode)
+{
+ ubifs_assert(nnode);
+ while (1) {
+ int i, cont = 0;
+
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ struct ubifs_cnode *cnode;
+
+ cnode = nnode->nbranch[i].cnode;
+ if (cnode &&
+ test_bit(DIRTY_CNODE, &cnode->flags)) {
+ if (cnode->level == 0)
+ return cnode;
+ nnode = (struct ubifs_nnode *)cnode;
+ cont = 1;
+ break;
+ }
+ }
+ if (!cont)
+ return (struct ubifs_cnode *)nnode;
+ }
+}
+
+/**
+ * next_dirty_cnode - find next dirty cnode.
+ * @cnode: cnode from which to begin searching
+ *
+ * This function returns the next dirty cnode or %NULL if there is not one.
+ */
+static struct ubifs_cnode *next_dirty_cnode(struct ubifs_cnode *cnode)
+{
+ struct ubifs_nnode *nnode;
+ int i;
+
+ ubifs_assert(cnode);
+ nnode = cnode->parent;
+ if (!nnode)
+ return NULL;
+ for (i = cnode->iip + 1; i < UBIFS_LPT_FANOUT; i++) {
+ cnode = nnode->nbranch[i].cnode;
+ if (cnode && test_bit(DIRTY_CNODE, &cnode->flags)) {
+ if (cnode->level == 0)
+ return cnode; /* cnode is a pnode */
+ /* cnode is a nnode */
+ return first_dirty_cnode((struct ubifs_nnode *)cnode);
+ }
+ }
+ return (struct ubifs_cnode *)nnode;
+}
+
+/**
+ * get_cnodes_to_commit - create list of dirty cnodes to commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the number of cnodes to commit.
+ */
+static int get_cnodes_to_commit(struct ubifs_info *c)
+{
+ struct ubifs_cnode *cnode, *cnext;
+ int cnt = 0;
+
+ if (!c->nroot)
+ return 0;
+
+ if (!test_bit(DIRTY_CNODE, &c->nroot->flags))
+ return 0;
+
+ c->lpt_cnext = first_dirty_cnode(c->nroot);
+ cnode = c->lpt_cnext;
+ if (!cnode)
+ return 0;
+ cnt += 1;
+ while (1) {
+ ubifs_assert(!test_bit(COW_ZNODE, &cnode->flags));
+ __set_bit(COW_ZNODE, &cnode->flags);
+ cnext = next_dirty_cnode(cnode);
+ if (!cnext) {
+ cnode->cnext = c->lpt_cnext;
+ break;
+ }
+ cnode->cnext = cnext;
+ cnode = cnext;
+ cnt += 1;
+ }
+ dbg_cmt("committing %d cnodes", cnt);
+ dbg_lp("committing %d cnodes", cnt);
+ ubifs_assert(cnt == c->dirty_nn_cnt + c->dirty_pn_cnt);
+ return cnt;
+}
+
+/**
+ * upd_ltab - update LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @free: amount of free space
+ * @dirty: amount of dirty space to add
+ */
+static void upd_ltab(struct ubifs_info *c, int lnum, int free, int dirty)
+{
+ dbg_lp("LEB %d free %d dirty %d to %d +%d",
+ lnum, c->ltab[lnum - c->lpt_first].free,
+ c->ltab[lnum - c->lpt_first].dirty, free, dirty);
+ ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
+ c->ltab[lnum - c->lpt_first].free = free;
+ c->ltab[lnum - c->lpt_first].dirty += dirty;
+}
+
+/**
+ * alloc_lpt_leb - allocate an LPT LEB that is empty.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number is passed and returned here
+ *
+ * This function finds the next empty LEB in the ltab starting from @lnum. If a
+ * an empty LEB is found it is returned in @lnum and the function returns %0.
+ * Otherwise the function returns -ENOSPC. Note however, that LPT is designed
+ * never to run out of space.
+ */
+static int alloc_lpt_leb(struct ubifs_info *c, int *lnum)
+{
+ int i, n;
+
+ n = *lnum - c->lpt_first + 1;
+ for (i = n; i < c->lpt_lebs; i++) {
+ if (c->ltab[i].tgc || c->ltab[i].cmt)
+ continue;
+ if (c->ltab[i].free == c->leb_size) {
+ c->ltab[i].cmt = 1;
+ *lnum = i + c->lpt_first;
+ return 0;
+ }
+ }
+
+ for (i = 0; i < n; i++) {
+ if (c->ltab[i].tgc || c->ltab[i].cmt)
+ continue;
+ if (c->ltab[i].free == c->leb_size) {
+ c->ltab[i].cmt = 1;
+ *lnum = i + c->lpt_first;
+ return 0;
+ }
+ }
+ dbg_err("last LEB %d", *lnum);
+ dump_stack();
+ return -ENOSPC;
+}
+
+/**
+ * layout_cnodes - layout cnodes for commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int layout_cnodes(struct ubifs_info *c)
+{
+ int lnum, offs, len, alen, done_lsave, done_ltab, err;
+ struct ubifs_cnode *cnode;
+
+ cnode = c->lpt_cnext;
+ if (!cnode)
+ return 0;
+ lnum = c->nhead_lnum;
+ offs = c->nhead_offs;
+ /* Try to place lsave and ltab nicely */
+ done_lsave = !c->big_lpt;
+ done_ltab = 0;
+ if (!done_lsave && offs + c->lsave_sz <= c->leb_size) {
+ done_lsave = 1;
+ c->lsave_lnum = lnum;
+ c->lsave_offs = offs;
+ offs += c->lsave_sz;
+ }
+
+ if (offs + c->ltab_sz <= c->leb_size) {
+ done_ltab = 1;
+ c->ltab_lnum = lnum;
+ c->ltab_offs = offs;
+ offs += c->ltab_sz;
+ }
+
+ do {
+ if (cnode->level) {
+ len = c->nnode_sz;
+ c->dirty_nn_cnt -= 1;
+ } else {
+ len = c->pnode_sz;
+ c->dirty_pn_cnt -= 1;
+ }
+ while (offs + len > c->leb_size) {
+ alen = ALIGN(offs, c->min_io_size);
+ upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+ err = alloc_lpt_leb(c, &lnum);
+ if (err)
+ return err;
+ offs = 0;
+ ubifs_assert(lnum >= c->lpt_first &&
+ lnum <= c->lpt_last);
+ /* Try to place lsave and ltab nicely */
+ if (!done_lsave) {
+ done_lsave = 1;
+ c->lsave_lnum = lnum;
+ c->lsave_offs = offs;
+ offs += c->lsave_sz;
+ continue;
+ }
+ if (!done_ltab) {
+ done_ltab = 1;
+ c->ltab_lnum = lnum;
+ c->ltab_offs = offs;
+ offs += c->ltab_sz;
+ continue;
+ }
+ break;
+ }
+ if (cnode->parent) {
+ cnode->parent->nbranch[cnode->iip].lnum = lnum;
+ cnode->parent->nbranch[cnode->iip].offs = offs;
+ } else {
+ c->lpt_lnum = lnum;
+ c->lpt_offs = offs;
+ }
+ offs += len;
+ cnode = cnode->cnext;
+ } while (cnode && cnode != c->lpt_cnext);
+
+ /* Make sure to place LPT's save table */
+ if (!done_lsave) {
+ if (offs + c->lsave_sz > c->leb_size) {
+ alen = ALIGN(offs, c->min_io_size);
+ upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+ err = alloc_lpt_leb(c, &lnum);
+ if (err)
+ return err;
+ offs = 0;
+ ubifs_assert(lnum >= c->lpt_first &&
+ lnum <= c->lpt_last);
+ }
+ done_lsave = 1;
+ c->lsave_lnum = lnum;
+ c->lsave_offs = offs;
+ offs += c->lsave_sz;
+ }
+
+ /* Make sure to place LPT's own lprops table */
+ if (!done_ltab) {
+ if (offs + c->ltab_sz > c->leb_size) {
+ alen = ALIGN(offs, c->min_io_size);
+ upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+ err = alloc_lpt_leb(c, &lnum);
+ if (err)
+ return err;
+ offs = 0;
+ ubifs_assert(lnum >= c->lpt_first &&
+ lnum <= c->lpt_last);
+ }
+ done_ltab = 1;
+ c->ltab_lnum = lnum;
+ c->ltab_offs = offs;
+ offs += c->ltab_sz;
+ }
+
+ alen = ALIGN(offs, c->min_io_size);
+ upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+ return 0;
+}
+
+/**
+ * realloc_lpt_leb - allocate an LPT LEB that is empty.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number is passed and returned here
+ *
+ * This function duplicates exactly the results of the function alloc_lpt_leb.
+ * It is used during end commit to reallocate the same LEB numbers that were
+ * allocated by alloc_lpt_leb during start commit.
+ *
+ * This function finds the next LEB that was allocated by the alloc_lpt_leb
+ * function starting from @lnum. If a LEB is found it is returned in @lnum and
+ * the function returns %0. Otherwise the function returns -ENOSPC.
+ * Note however, that LPT is designed never to run out of space.
+ */
+static int realloc_lpt_leb(struct ubifs_info *c, int *lnum)
+{
+ int i, n;
+
+ n = *lnum - c->lpt_first + 1;
+ for (i = n; i < c->lpt_lebs; i++)
+ if (c->ltab[i].cmt) {
+ c->ltab[i].cmt = 0;
+ *lnum = i + c->lpt_first;
+ return 0;
+ }
+
+ for (i = 0; i < n; i++)
+ if (c->ltab[i].cmt) {
+ c->ltab[i].cmt = 0;
+ *lnum = i + c->lpt_first;
+ return 0;
+ }
+ dbg_err("last LEB %d", *lnum);
+ dump_stack();
+ return -ENOSPC;
+}
+
+/**
+ * write_cnodes - write cnodes for commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int write_cnodes(struct ubifs_info *c)
+{
+ int lnum, offs, len, from, err, wlen, alen, done_ltab, done_lsave;
+ struct ubifs_cnode *cnode;
+ void *buf = c->lpt_buf;
+
+ cnode = c->lpt_cnext;
+ if (!cnode)
+ return 0;
+ lnum = c->nhead_lnum;
+ offs = c->nhead_offs;
+ from = offs;
+ /* Ensure empty LEB is unmapped */
+ if (offs == 0) {
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ }
+ /* Try to place lsave and ltab nicely */
+ done_lsave = !c->big_lpt;
+ done_ltab = 0;
+ if (!done_lsave && offs + c->lsave_sz <= c->leb_size) {
+ done_lsave = 1;
+ ubifs_pack_lsave(c, buf + offs, c->lsave);
+ offs += c->lsave_sz;
+ }
+
+ if (offs + c->ltab_sz <= c->leb_size) {
+ done_ltab = 1;
+ ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
+ offs += c->ltab_sz;
+ }
+
+ /* Loop for each cnode */
+ do {
+ if (cnode->level)
+ len = c->nnode_sz;
+ else
+ len = c->pnode_sz;
+ while (offs + len > c->leb_size) {
+ wlen = offs - from;
+ if (wlen) {
+ alen = ALIGN(wlen, c->min_io_size);
+ memset(buf + offs, 0xff, alen - wlen);
+ err = ubifs_leb_write(c, lnum, buf + from, from,
+ alen, UBI_SHORTTERM);
+ if (err)
+ return err;
+ }
+ err = realloc_lpt_leb(c, &lnum);
+ if (err)
+ return err;
+ offs = 0;
+ from = 0;
+ ubifs_assert(lnum >= c->lpt_first &&
+ lnum <= c->lpt_last);
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ /* Try to place lsave and ltab nicely */
+ if (!done_lsave) {
+ done_lsave = 1;
+ ubifs_pack_lsave(c, buf + offs, c->lsave);
+ offs += c->lsave_sz;
+ continue;
+ }
+ if (!done_ltab) {
+ done_ltab = 1;
+ ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
+ offs += c->ltab_sz;
+ continue;
+ }
+ break;
+ }
+ if (cnode->level)
+ ubifs_pack_nnode(c, buf + offs,
+ (struct ubifs_nnode *)cnode);
+ else
+ ubifs_pack_pnode(c, buf + offs,
+ (struct ubifs_pnode *)cnode);
+ /*
+ * The reason for the barriers is the same as in case of TNC.
+ * See comment in 'write_index()'. 'dirty_cow_nnode()' and
+ * 'dirty_cow_pnode()' are the functions for which this is
+ * important.
+ */
+ clear_bit(DIRTY_CNODE, &cnode->flags);
+ smp_mb__before_clear_bit();
+ clear_bit(COW_ZNODE, &cnode->flags);
+ smp_mb__after_clear_bit();
+ offs += len;
+ cnode = cnode->cnext;
+ } while (cnode && cnode != c->lpt_cnext);
+
+ /* Make sure to place LPT's save table */
+ if (!done_lsave) {
+ if (offs + c->lsave_sz > c->leb_size) {
+ wlen = offs - from;
+ alen = ALIGN(wlen, c->min_io_size);
+ memset(buf + offs, 0xff, alen - wlen);
+ err = ubifs_leb_write(c, lnum, buf + from, from, alen,
+ UBI_SHORTTERM);
+ if (err)
+ return err;
+ err = realloc_lpt_leb(c, &lnum);
+ if (err)
+ return err;
+ offs = 0;
+ ubifs_assert(lnum >= c->lpt_first &&
+ lnum <= c->lpt_last);
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ }
+ done_lsave = 1;
+ ubifs_pack_lsave(c, buf + offs, c->lsave);
+ offs += c->lsave_sz;
+ }
+
+ /* Make sure to place LPT's own lprops table */
+ if (!done_ltab) {
+ if (offs + c->ltab_sz > c->leb_size) {
+ wlen = offs - from;
+ alen = ALIGN(wlen, c->min_io_size);
+ memset(buf + offs, 0xff, alen - wlen);
+ err = ubifs_leb_write(c, lnum, buf + from, from, alen,
+ UBI_SHORTTERM);
+ if (err)
+ return err;
+ err = realloc_lpt_leb(c, &lnum);
+ if (err)
+ return err;
+ offs = 0;
+ ubifs_assert(lnum >= c->lpt_first &&
+ lnum <= c->lpt_last);
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ }
+ done_ltab = 1;
+ ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
+ offs += c->ltab_sz;
+ }
+
+ /* Write remaining data in buffer */
+ wlen = offs - from;
+ alen = ALIGN(wlen, c->min_io_size);
+ memset(buf + offs, 0xff, alen - wlen);
+ err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM);
+ if (err)
+ return err;
+ c->nhead_lnum = lnum;
+ c->nhead_offs = ALIGN(offs, c->min_io_size);
+
+ dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
+ dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
+ dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
+ if (c->big_lpt)
+ dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+ return 0;
+}
+
+/**
+ * next_pnode - find next pnode.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode
+ *
+ * This function returns the next pnode or %NULL if there are no more pnodes.
+ */
+static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
+ struct ubifs_pnode *pnode)
+{
+ struct ubifs_nnode *nnode;
+ int iip;
+
+ /* Try to go right */
+ nnode = pnode->parent;
+ iip = pnode->iip + 1;
+ if (iip < UBIFS_LPT_FANOUT) {
+ /* We assume here that LEB zero is never an LPT LEB */
+ if (nnode->nbranch[iip].lnum)
+ return ubifs_get_pnode(c, nnode, iip);
+ else
+ return NULL;
+ }
+
+ /* Go up while can't go right */
+ do {
+ iip = nnode->iip + 1;
+ nnode = nnode->parent;
+ if (!nnode)
+ return NULL;
+ /* We assume here that LEB zero is never an LPT LEB */
+ } while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum);
+
+ /* Go right */
+ nnode = ubifs_get_nnode(c, nnode, iip);
+ if (IS_ERR(nnode))
+ return (void *)nnode;
+
+ /* Go down to level 1 */
+ while (nnode->level > 1) {
+ nnode = ubifs_get_nnode(c, nnode, 0);
+ if (IS_ERR(nnode))
+ return (void *)nnode;
+ }
+
+ return ubifs_get_pnode(c, nnode, 0);
+}
+
+/**
+ * pnode_lookup - lookup a pnode in the LPT.
+ * @c: UBIFS file-system description object
+ * @i: pnode number (0 to main_lebs - 1)
+ *
+ * This function returns a pointer to the pnode on success or a negative
+ * error code on failure.
+ */
+static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i)
+{
+ int err, h, iip, shft;
+ struct ubifs_nnode *nnode;
+
+ if (!c->nroot) {
+ err = ubifs_read_nnode(c, NULL, 0);
+ if (err)
+ return ERR_PTR(err);
+ }
+ i <<= UBIFS_LPT_FANOUT_SHIFT;
+ nnode = c->nroot;
+ shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+ for (h = 1; h < c->lpt_hght; h++) {
+ iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+ shft -= UBIFS_LPT_FANOUT_SHIFT;
+ nnode = ubifs_get_nnode(c, nnode, iip);
+ if (IS_ERR(nnode))
+ return ERR_PTR(PTR_ERR(nnode));
+ }
+ iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+ return ubifs_get_pnode(c, nnode, iip);
+}
+
+/**
+ * add_pnode_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode for which to add dirt
+ */
+static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+ ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum,
+ c->pnode_sz);
+}
+
+/**
+ * do_make_pnode_dirty - mark a pnode dirty.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to mark dirty
+ */
+static void do_make_pnode_dirty(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+ /* Assumes cnext list is empty i.e. not called during commit */
+ if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) {
+ struct ubifs_nnode *nnode;
+
+ c->dirty_pn_cnt += 1;
+ add_pnode_dirt(c, pnode);
+ /* Mark parent and ancestors dirty too */
+ nnode = pnode->parent;
+ while (nnode) {
+ if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
+ c->dirty_nn_cnt += 1;
+ ubifs_add_nnode_dirt(c, nnode);
+ nnode = nnode->parent;
+ } else
+ break;
+ }
+ }
+}
+
+/**
+ * make_tree_dirty - mark the entire LEB properties tree dirty.
+ * @c: UBIFS file-system description object
+ *
+ * This function is used by the "small" LPT model to cause the entire LEB
+ * properties tree to be written. The "small" LPT model does not use LPT
+ * garbage collection because it is more efficient to write the entire tree
+ * (because it is small).
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_tree_dirty(struct ubifs_info *c)
+{
+ struct ubifs_pnode *pnode;
+
+ pnode = pnode_lookup(c, 0);
+ while (pnode) {
+ do_make_pnode_dirty(c, pnode);
+ pnode = next_pnode(c, pnode);
+ if (IS_ERR(pnode))
+ return PTR_ERR(pnode);
+ }
+ return 0;
+}
+
+/**
+ * need_write_all - determine if the LPT area is running out of free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %1 if the LPT area is running out of free space and %0
+ * if it is not.
+ */
+static int need_write_all(struct ubifs_info *c)
+{
+ long long free = 0;
+ int i;
+
+ for (i = 0; i < c->lpt_lebs; i++) {
+ if (i + c->lpt_first == c->nhead_lnum)
+ free += c->leb_size - c->nhead_offs;
+ else if (c->ltab[i].free == c->leb_size)
+ free += c->leb_size;
+ else if (c->ltab[i].free + c->ltab[i].dirty == c->leb_size)
+ free += c->leb_size;
+ }
+ /* Less than twice the size left */
+ if (free <= c->lpt_sz * 2)
+ return 1;
+ return 0;
+}
+
+/**
+ * lpt_tgc_start - start trivial garbage collection of LPT LEBs.
+ * @c: UBIFS file-system description object
+ *
+ * LPT trivial garbage collection is where a LPT LEB contains only dirty and
+ * free space and so may be reused as soon as the next commit is completed.
+ * This function is called during start commit to mark LPT LEBs for trivial GC.
+ */
+static void lpt_tgc_start(struct ubifs_info *c)
+{
+ int i;
+
+ for (i = 0; i < c->lpt_lebs; i++) {
+ if (i + c->lpt_first == c->nhead_lnum)
+ continue;
+ if (c->ltab[i].dirty > 0 &&
+ c->ltab[i].free + c->ltab[i].dirty == c->leb_size) {
+ c->ltab[i].tgc = 1;
+ c->ltab[i].free = c->leb_size;
+ c->ltab[i].dirty = 0;
+ dbg_lp("LEB %d", i + c->lpt_first);
+ }
+ }
+}
+
+/**
+ * lpt_tgc_end - end trivial garbage collection of LPT LEBs.
+ * @c: UBIFS file-system description object
+ *
+ * LPT trivial garbage collection is where a LPT LEB contains only dirty and
+ * free space and so may be reused as soon as the next commit is completed.
+ * This function is called after the commit is completed (master node has been
+ * written) and unmaps LPT LEBs that were marked for trivial GC.
+ */
+static int lpt_tgc_end(struct ubifs_info *c)
+{
+ int i, err;
+
+ for (i = 0; i < c->lpt_lebs; i++)
+ if (c->ltab[i].tgc) {
+ err = ubifs_leb_unmap(c, i + c->lpt_first);
+ if (err)
+ return err;
+ c->ltab[i].tgc = 0;
+ dbg_lp("LEB %d", i + c->lpt_first);
+ }
+ return 0;
+}
+
+/**
+ * populate_lsave - fill the lsave array with important LEB numbers.
+ * @c: the UBIFS file-system description object
+ *
+ * This function is only called for the "big" model. It records a small number
+ * of LEB numbers of important LEBs. Important LEBs are ones that are (from
+ * most important to least important): empty, freeable, freeable index, dirty
+ * index, dirty or free. Upon mount, we read this list of LEB numbers and bring
+ * their pnodes into memory. That will stop us from having to scan the LPT
+ * straight away. For the "small" model we assume that scanning the LPT is no
+ * big deal.
+ */
+static void populate_lsave(struct ubifs_info *c)
+{
+ struct ubifs_lprops *lprops;
+ struct ubifs_lpt_heap *heap;
+ int i, cnt = 0;
+
+ ubifs_assert(c->big_lpt);
+ if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) {
+ c->lpt_drty_flgs |= LSAVE_DIRTY;
+ ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
+ }
+ list_for_each_entry(lprops, &c->empty_list, list) {
+ c->lsave[cnt++] = lprops->lnum;
+ if (cnt >= c->lsave_cnt)
+ return;
+ }
+ list_for_each_entry(lprops, &c->freeable_list, list) {
+ c->lsave[cnt++] = lprops->lnum;
+ if (cnt >= c->lsave_cnt)
+ return;
+ }
+ list_for_each_entry(lprops, &c->frdi_idx_list, list) {
+ c->lsave[cnt++] = lprops->lnum;
+ if (cnt >= c->lsave_cnt)
+ return;
+ }
+ heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+ for (i = 0; i < heap->cnt; i++) {
+ c->lsave[cnt++] = heap->arr[i]->lnum;
+ if (cnt >= c->lsave_cnt)
+ return;
+ }
+ heap = &c->lpt_heap[LPROPS_DIRTY - 1];
+ for (i = 0; i < heap->cnt; i++) {
+ c->lsave[cnt++] = heap->arr[i]->lnum;
+ if (cnt >= c->lsave_cnt)
+ return;
+ }
+ heap = &c->lpt_heap[LPROPS_FREE - 1];
+ for (i = 0; i < heap->cnt; i++) {
+ c->lsave[cnt++] = heap->arr[i]->lnum;
+ if (cnt >= c->lsave_cnt)
+ return;
+ }
+ /* Fill it up completely */
+ while (cnt < c->lsave_cnt)
+ c->lsave[cnt++] = c->main_first;
+}
+
+/**
+ * nnode_lookup - lookup a nnode in the LPT.
+ * @c: UBIFS file-system description object
+ * @i: nnode number
+ *
+ * This function returns a pointer to the nnode on success or a negative
+ * error code on failure.
+ */
+static struct ubifs_nnode *nnode_lookup(struct ubifs_info *c, int i)
+{
+ int err, iip;
+ struct ubifs_nnode *nnode;
+
+ if (!c->nroot) {
+ err = ubifs_read_nnode(c, NULL, 0);
+ if (err)
+ return ERR_PTR(err);
+ }
+ nnode = c->nroot;
+ while (1) {
+ iip = i & (UBIFS_LPT_FANOUT - 1);
+ i >>= UBIFS_LPT_FANOUT_SHIFT;
+ if (!i)
+ break;
+ nnode = ubifs_get_nnode(c, nnode, iip);
+ if (IS_ERR(nnode))
+ return nnode;
+ }
+ return nnode;
+}
+
+/**
+ * make_nnode_dirty - find a nnode and, if found, make it dirty.
+ * @c: UBIFS file-system description object
+ * @node_num: nnode number of nnode to make dirty
+ * @lnum: LEB number where nnode was written
+ * @offs: offset where nnode was written
+ *
+ * This function is used by LPT garbage collection. LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty. The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_nnode_dirty(struct ubifs_info *c, int node_num, int lnum,
+ int offs)
+{
+ struct ubifs_nnode *nnode;
+
+ nnode = nnode_lookup(c, node_num);
+ if (IS_ERR(nnode))
+ return PTR_ERR(nnode);
+ if (nnode->parent) {
+ struct ubifs_nbranch *branch;
+
+ branch = &nnode->parent->nbranch[nnode->iip];
+ if (branch->lnum != lnum || branch->offs != offs)
+ return 0; /* nnode is obsolete */
+ } else if (c->lpt_lnum != lnum || c->lpt_offs != offs)
+ return 0; /* nnode is obsolete */
+ /* Assumes cnext list is empty i.e. not called during commit */
+ if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
+ c->dirty_nn_cnt += 1;
+ ubifs_add_nnode_dirt(c, nnode);
+ /* Mark parent and ancestors dirty too */
+ nnode = nnode->parent;
+ while (nnode) {
+ if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
+ c->dirty_nn_cnt += 1;
+ ubifs_add_nnode_dirt(c, nnode);
+ nnode = nnode->parent;
+ } else
+ break;
+ }
+ }
+ return 0;
+}
+
+/**
+ * make_pnode_dirty - find a pnode and, if found, make it dirty.
+ * @c: UBIFS file-system description object
+ * @node_num: pnode number of pnode to make dirty
+ * @lnum: LEB number where pnode was written
+ * @offs: offset where pnode was written
+ *
+ * This function is used by LPT garbage collection. LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty. The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_pnode_dirty(struct ubifs_info *c, int node_num, int lnum,
+ int offs)
+{
+ struct ubifs_pnode *pnode;
+ struct ubifs_nbranch *branch;
+
+ pnode = pnode_lookup(c, node_num);
+ if (IS_ERR(pnode))
+ return PTR_ERR(pnode);
+ branch = &pnode->parent->nbranch[pnode->iip];
+ if (branch->lnum != lnum || branch->offs != offs)
+ return 0;
+ do_make_pnode_dirty(c, pnode);
+ return 0;
+}
+
+/**
+ * make_ltab_dirty - make ltab node dirty.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number where ltab was written
+ * @offs: offset where ltab was written
+ *
+ * This function is used by LPT garbage collection. LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty. The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_ltab_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+ if (lnum != c->ltab_lnum || offs != c->ltab_offs)
+ return 0; /* This ltab node is obsolete */
+ if (!(c->lpt_drty_flgs & LTAB_DIRTY)) {
+ c->lpt_drty_flgs |= LTAB_DIRTY;
+ ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz);
+ }
+ return 0;
+}
+
+/**
+ * make_lsave_dirty - make lsave node dirty.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number where lsave was written
+ * @offs: offset where lsave was written
+ *
+ * This function is used by LPT garbage collection. LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty. The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_lsave_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+ if (lnum != c->lsave_lnum || offs != c->lsave_offs)
+ return 0; /* This lsave node is obsolete */
+ if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) {
+ c->lpt_drty_flgs |= LSAVE_DIRTY;
+ ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
+ }
+ return 0;
+}
+
+/**
+ * make_node_dirty - make node dirty.
+ * @c: UBIFS file-system description object
+ * @node_type: LPT node type
+ * @node_num: node number
+ * @lnum: LEB number where node was written
+ * @offs: offset where node was written
+ *
+ * This function is used by LPT garbage collection. LPT garbage collection is
+ * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection
+ * simply involves marking all the nodes in the LEB being garbage-collected as
+ * dirty. The dirty nodes are written next commit, after which the LEB is free
+ * to be reused.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num,
+ int lnum, int offs)
+{
+ switch (node_type) {
+ case UBIFS_LPT_NNODE:
+ return make_nnode_dirty(c, node_num, lnum, offs);
+ case UBIFS_LPT_PNODE:
+ return make_pnode_dirty(c, node_num, lnum, offs);
+ case UBIFS_LPT_LTAB:
+ return make_ltab_dirty(c, lnum, offs);
+ case UBIFS_LPT_LSAVE:
+ return make_lsave_dirty(c, lnum, offs);
+ }
+ return -EINVAL;
+}
+
+/**
+ * get_lpt_node_len - return the length of a node based on its type.
+ * @c: UBIFS file-system description object
+ * @node_type: LPT node type
+ */
+static int get_lpt_node_len(struct ubifs_info *c, int node_type)
+{
+ switch (node_type) {
+ case UBIFS_LPT_NNODE:
+ return c->nnode_sz;
+ case UBIFS_LPT_PNODE:
+ return c->pnode_sz;
+ case UBIFS_LPT_LTAB:
+ return c->ltab_sz;
+ case UBIFS_LPT_LSAVE:
+ return c->lsave_sz;
+ }
+ return 0;
+}
+
+/**
+ * get_pad_len - return the length of padding in a buffer.
+ * @c: UBIFS file-system description object
+ * @buf: buffer
+ * @len: length of buffer
+ */
+static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len)
+{
+ int offs, pad_len;
+
+ if (c->min_io_size == 1)
+ return 0;
+ offs = c->leb_size - len;
+ pad_len = ALIGN(offs, c->min_io_size) - offs;
+ return pad_len;
+}
+
+/**
+ * get_lpt_node_type - return type (and node number) of a node in a buffer.
+ * @c: UBIFS file-system description object
+ * @buf: buffer
+ * @node_num: node number is returned here
+ */
+static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num)
+{
+ uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+ int pos = 0, node_type;
+
+ node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS);
+ *node_num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits);
+ return node_type;
+}
+
+/**
+ * is_a_node - determine if a buffer contains a node.
+ * @c: UBIFS file-system description object
+ * @buf: buffer
+ * @len: length of buffer
+ *
+ * This function returns %1 if the buffer contains a node or %0 if it does not.
+ */
+static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
+{
+ uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+ int pos = 0, node_type, node_len;
+ uint16_t crc, calc_crc;
+
+ node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS);
+ if (node_type == UBIFS_LPT_NOT_A_NODE)
+ return 0;
+ node_len = get_lpt_node_len(c, node_type);
+ if (!node_len || node_len > len)
+ return 0;
+ pos = 0;
+ addr = buf;
+ crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS);
+ calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+ node_len - UBIFS_LPT_CRC_BYTES);
+ if (crc != calc_crc)
+ return 0;
+ return 1;
+}
+
+
+/**
+ * lpt_gc_lnum - garbage collect a LPT LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to garbage collect
+ *
+ * LPT garbage collection is used only for the "big" LPT model
+ * (c->big_lpt == 1). Garbage collection simply involves marking all the nodes
+ * in the LEB being garbage-collected as dirty. The dirty nodes are written
+ * next commit, after which the LEB is free to be reused.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int lpt_gc_lnum(struct ubifs_info *c, int lnum)
+{
+ int err, len = c->leb_size, node_type, node_num, node_len, offs;
+ void *buf = c->lpt_buf;
+
+ dbg_lp("LEB %d", lnum);
+ err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
+ if (err) {
+ ubifs_err("cannot read LEB %d, error %d", lnum, err);
+ return err;
+ }
+ while (1) {
+ if (!is_a_node(c, buf, len)) {
+ int pad_len;
+
+ pad_len = get_pad_len(c, buf, len);
+ if (pad_len) {
+ buf += pad_len;
+ len -= pad_len;
+ continue;
+ }
+ return 0;
+ }
+ node_type = get_lpt_node_type(c, buf, &node_num);
+ node_len = get_lpt_node_len(c, node_type);
+ offs = c->leb_size - len;
+ ubifs_assert(node_len != 0);
+ mutex_lock(&c->lp_mutex);
+ err = make_node_dirty(c, node_type, node_num, lnum, offs);
+ mutex_unlock(&c->lp_mutex);
+ if (err)
+ return err;
+ buf += node_len;
+ len -= node_len;
+ }
+ return 0;
+}
+
+/**
+ * lpt_gc - LPT garbage collection.
+ * @c: UBIFS file-system description object
+ *
+ * Select a LPT LEB for LPT garbage collection and call 'lpt_gc_lnum()'.
+ * Returns %0 on success and a negative error code on failure.
+ */
+static int lpt_gc(struct ubifs_info *c)
+{
+ int i, lnum = -1, dirty = 0;
+
+ mutex_lock(&c->lp_mutex);
+ for (i = 0; i < c->lpt_lebs; i++) {
+ ubifs_assert(!c->ltab[i].tgc);
+ if (i + c->lpt_first == c->nhead_lnum ||
+ c->ltab[i].free + c->ltab[i].dirty == c->leb_size)
+ continue;
+ if (c->ltab[i].dirty > dirty) {
+ dirty = c->ltab[i].dirty;
+ lnum = i + c->lpt_first;
+ }
+ }
+ mutex_unlock(&c->lp_mutex);
+ if (lnum == -1)
+ return -ENOSPC;
+ return lpt_gc_lnum(c, lnum);
+}
+
+/**
+ * ubifs_lpt_start_commit - UBIFS commit starts.
+ * @c: the UBIFS file-system description object
+ *
+ * This function has to be called when UBIFS starts the commit operation.
+ * This function "freezes" all currently dirty LEB properties and does not
+ * change them anymore. Further changes are saved and tracked separately
+ * because they are not part of this commit. This function returns zero in case
+ * of success and a negative error code in case of failure.
+ */
+int ubifs_lpt_start_commit(struct ubifs_info *c)
+{
+ int err, cnt;
+
+ dbg_lp("");
+
+ mutex_lock(&c->lp_mutex);
+ err = dbg_check_ltab(c);
+ if (err)
+ goto out;
+
+ if (c->check_lpt_free) {
+ /*
+ * We ensure there is enough free space in
+ * ubifs_lpt_post_commit() by marking nodes dirty. That
+ * information is lost when we unmount, so we also need
+ * to check free space once after mounting also.
+ */
+ c->check_lpt_free = 0;
+ while (need_write_all(c)) {
+ mutex_unlock(&c->lp_mutex);
+ err = lpt_gc(c);
+ if (err)
+ return err;
+ mutex_lock(&c->lp_mutex);
+ }
+ }
+
+ lpt_tgc_start(c);
+
+ if (!c->dirty_pn_cnt) {
+ dbg_cmt("no cnodes to commit");
+ err = 0;
+ goto out;
+ }
+
+ if (!c->big_lpt && need_write_all(c)) {
+ /* If needed, write everything */
+ err = make_tree_dirty(c);
+ if (err)
+ goto out;
+ lpt_tgc_start(c);
+ }
+
+ if (c->big_lpt)
+ populate_lsave(c);
+
+ cnt = get_cnodes_to_commit(c);
+ ubifs_assert(cnt != 0);
+
+ err = layout_cnodes(c);
+ if (err)
+ goto out;
+
+ /* Copy the LPT's own lprops for end commit to write */
+ memcpy(c->ltab_cmt, c->ltab,
+ sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+ c->lpt_drty_flgs &= ~(LTAB_DIRTY | LSAVE_DIRTY);
+
+out:
+ mutex_unlock(&c->lp_mutex);
+ return err;
+}
+
+/**
+ * free_obsolete_cnodes - free obsolete cnodes for commit end.
+ * @c: UBIFS file-system description object
+ */
+static void free_obsolete_cnodes(struct ubifs_info *c)
+{
+ struct ubifs_cnode *cnode, *cnext;
+
+ cnext = c->lpt_cnext;
+ if (!cnext)
+ return;
+ do {
+ cnode = cnext;
+ cnext = cnode->cnext;
+ if (test_bit(OBSOLETE_CNODE, &cnode->flags))
+ kfree(cnode);
+ else
+ cnode->cnext = NULL;
+ } while (cnext != c->lpt_cnext);
+ c->lpt_cnext = NULL;
+}
+
+/**
+ * ubifs_lpt_end_commit - finish the commit operation.
+ * @c: the UBIFS file-system description object
+ *
+ * This function has to be called when the commit operation finishes. It
+ * flushes the changes which were "frozen" by 'ubifs_lprops_start_commit()' to
+ * the media. Returns zero in case of success and a negative error code in case
+ * of failure.
+ */
+int ubifs_lpt_end_commit(struct ubifs_info *c)
+{
+ int err;
+
+ dbg_lp("");
+
+ if (!c->lpt_cnext)
+ return 0;
+
+ err = write_cnodes(c);
+ if (err)
+ return err;
+
+ mutex_lock(&c->lp_mutex);
+ free_obsolete_cnodes(c);
+ mutex_unlock(&c->lp_mutex);
+
+ return 0;
+}
+
+/**
+ * ubifs_lpt_post_commit - post commit LPT trivial GC and LPT GC.
+ * @c: UBIFS file-system description object
+ *
+ * LPT trivial GC is completed after a commit. Also LPT GC is done after a
+ * commit for the "big" LPT model.
+ */
+int ubifs_lpt_post_commit(struct ubifs_info *c)
+{
+ int err;
+
+ mutex_lock(&c->lp_mutex);
+ err = lpt_tgc_end(c);
+ if (err)
+ goto out;
+ if (c->big_lpt)
+ while (need_write_all(c)) {
+ mutex_unlock(&c->lp_mutex);
+ err = lpt_gc(c);
+ if (err)
+ return err;
+ mutex_lock(&c->lp_mutex);
+ }
+out:
+ mutex_unlock(&c->lp_mutex);
+ return err;
+}
+
+/**
+ * first_nnode - find the first nnode in memory.
+ * @c: UBIFS file-system description object
+ * @hght: height of tree where nnode found is returned here
+ *
+ * This function returns a pointer to the nnode found or %NULL if no nnode is
+ * found. This function is a helper to 'ubifs_lpt_free()'.
+ */
+static struct ubifs_nnode *first_nnode(struct ubifs_info *c, int *hght)
+{
+ struct ubifs_nnode *nnode;
+ int h, i, found;
+
+ nnode = c->nroot;
+ *hght = 0;
+ if (!nnode)
+ return NULL;
+ for (h = 1; h < c->lpt_hght; h++) {
+ found = 0;
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ if (nnode->nbranch[i].nnode) {
+ found = 1;
+ nnode = nnode->nbranch[i].nnode;
+ *hght = h;
+ break;
+ }
+ }
+ if (!found)
+ break;
+ }
+ return nnode;
+}
+
+/**
+ * next_nnode - find the next nnode in memory.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode from which to start.
+ * @hght: height of tree where nnode is, is passed and returned here
+ *
+ * This function returns a pointer to the nnode found or %NULL if no nnode is
+ * found. This function is a helper to 'ubifs_lpt_free()'.
+ */
+static struct ubifs_nnode *next_nnode(struct ubifs_info *c,
+ struct ubifs_nnode *nnode, int *hght)
+{
+ struct ubifs_nnode *parent;
+ int iip, h, i, found;
+
+ parent = nnode->parent;
+ if (!parent)
+ return NULL;
+ if (nnode->iip == UBIFS_LPT_FANOUT - 1) {
+ *hght -= 1;
+ return parent;
+ }
+ for (iip = nnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
+ nnode = parent->nbranch[iip].nnode;
+ if (nnode)
+ break;
+ }
+ if (!nnode) {
+ *hght -= 1;
+ return parent;
+ }
+ for (h = *hght + 1; h < c->lpt_hght; h++) {
+ found = 0;
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ if (nnode->nbranch[i].nnode) {
+ found = 1;
+ nnode = nnode->nbranch[i].nnode;
+ *hght = h;
+ break;
+ }
+ }
+ if (!found)
+ break;
+ }
+ return nnode;
+}
+
+/**
+ * ubifs_lpt_free - free resources owned by the LPT.
+ * @c: UBIFS file-system description object
+ * @wr_only: free only resources used for writing
+ */
+void ubifs_lpt_free(struct ubifs_info *c, int wr_only)
+{
+ struct ubifs_nnode *nnode;
+ int i, hght;
+
+ /* Free write-only things first */
+
+ free_obsolete_cnodes(c); /* Leftover from a failed commit */
+
+ vfree(c->ltab_cmt);
+ c->ltab_cmt = NULL;
+ vfree(c->lpt_buf);
+ c->lpt_buf = NULL;
+ kfree(c->lsave);
+ c->lsave = NULL;
+
+ if (wr_only)
+ return;
+
+ /* Now free the rest */
+
+ nnode = first_nnode(c, &hght);
+ while (nnode) {
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++)
+ kfree(nnode->nbranch[i].nnode);
+ nnode = next_nnode(c, nnode, &hght);
+ }
+ for (i = 0; i < LPROPS_HEAP_CNT; i++)
+ kfree(c->lpt_heap[i].arr);
+ kfree(c->dirty_idx.arr);
+ kfree(c->nroot);
+ vfree(c->ltab);
+ kfree(c->lpt_nod_buf);
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * dbg_is_all_ff - determine if a buffer contains only 0xff bytes.
+ * @buf: buffer
+ * @len: buffer length
+ */
+static int dbg_is_all_ff(uint8_t *buf, int len)
+{
+ int i;
+
+ for (i = 0; i < len; i++)
+ if (buf[i] != 0xff)
+ return 0;
+ return 1;
+}
+
+/**
+ * dbg_is_nnode_dirty - determine if a nnode is dirty.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number where nnode was written
+ * @offs: offset where nnode was written
+ */
+static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+ struct ubifs_nnode *nnode;
+ int hght;
+
+ /* Entire tree is in memory so first_nnode / next_nnode are ok */
+ nnode = first_nnode(c, &hght);
+ for (; nnode; nnode = next_nnode(c, nnode, &hght)) {
+ struct ubifs_nbranch *branch;
+
+ cond_resched();
+ if (nnode->parent) {
+ branch = &nnode->parent->nbranch[nnode->iip];
+ if (branch->lnum != lnum || branch->offs != offs)
+ continue;
+ if (test_bit(DIRTY_CNODE, &nnode->flags))
+ return 1;
+ return 0;
+ } else {
+ if (c->lpt_lnum != lnum || c->lpt_offs != offs)
+ continue;
+ if (test_bit(DIRTY_CNODE, &nnode->flags))
+ return 1;
+ return 0;
+ }
+ }
+ return 1;
+}
+
+/**
+ * dbg_is_pnode_dirty - determine if a pnode is dirty.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number where pnode was written
+ * @offs: offset where pnode was written
+ */
+static int dbg_is_pnode_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+ int i, cnt;
+
+ cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
+ for (i = 0; i < cnt; i++) {
+ struct ubifs_pnode *pnode;
+ struct ubifs_nbranch *branch;
+
+ cond_resched();
+ pnode = pnode_lookup(c, i);
+ if (IS_ERR(pnode))
+ return PTR_ERR(pnode);
+ branch = &pnode->parent->nbranch[pnode->iip];
+ if (branch->lnum != lnum || branch->offs != offs)
+ continue;
+ if (test_bit(DIRTY_CNODE, &pnode->flags))
+ return 1;
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * dbg_is_ltab_dirty - determine if a ltab node is dirty.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number where ltab node was written
+ * @offs: offset where ltab node was written
+ */
+static int dbg_is_ltab_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+ if (lnum != c->ltab_lnum || offs != c->ltab_offs)
+ return 1;
+ return (c->lpt_drty_flgs & LTAB_DIRTY) != 0;
+}
+
+/**
+ * dbg_is_lsave_dirty - determine if a lsave node is dirty.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number where lsave node was written
+ * @offs: offset where lsave node was written
+ */
+static int dbg_is_lsave_dirty(struct ubifs_info *c, int lnum, int offs)
+{
+ if (lnum != c->lsave_lnum || offs != c->lsave_offs)
+ return 1;
+ return (c->lpt_drty_flgs & LSAVE_DIRTY) != 0;
+}
+
+/**
+ * dbg_is_node_dirty - determine if a node is dirty.
+ * @c: the UBIFS file-system description object
+ * @node_type: node type
+ * @lnum: LEB number where node was written
+ * @offs: offset where node was written
+ */
+static int dbg_is_node_dirty(struct ubifs_info *c, int node_type, int lnum,
+ int offs)
+{
+ switch (node_type) {
+ case UBIFS_LPT_NNODE:
+ return dbg_is_nnode_dirty(c, lnum, offs);
+ case UBIFS_LPT_PNODE:
+ return dbg_is_pnode_dirty(c, lnum, offs);
+ case UBIFS_LPT_LTAB:
+ return dbg_is_ltab_dirty(c, lnum, offs);
+ case UBIFS_LPT_LSAVE:
+ return dbg_is_lsave_dirty(c, lnum, offs);
+ }
+ return 1;
+}
+
+/**
+ * dbg_check_ltab_lnum - check the ltab for a LPT LEB number.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB number where node was written
+ * @offs: offset where node was written
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
+{
+ int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
+ int ret;
+ void *buf = c->dbg_buf;
+
+ dbg_lp("LEB %d", lnum);
+ err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
+ if (err) {
+ dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err);
+ return err;
+ }
+ while (1) {
+ if (!is_a_node(c, buf, len)) {
+ int i, pad_len;
+
+ pad_len = get_pad_len(c, buf, len);
+ if (pad_len) {
+ buf += pad_len;
+ len -= pad_len;
+ dirty += pad_len;
+ continue;
+ }
+ if (!dbg_is_all_ff(buf, len)) {
+ dbg_msg("invalid empty space in LEB %d at %d",
+ lnum, c->leb_size - len);
+ err = -EINVAL;
+ }
+ i = lnum - c->lpt_first;
+ if (len != c->ltab[i].free) {
+ dbg_msg("invalid free space in LEB %d "
+ "(free %d, expected %d)",
+ lnum, len, c->ltab[i].free);
+ err = -EINVAL;
+ }
+ if (dirty != c->ltab[i].dirty) {
+ dbg_msg("invalid dirty space in LEB %d "
+ "(dirty %d, expected %d)",
+ lnum, dirty, c->ltab[i].dirty);
+ err = -EINVAL;
+ }
+ return err;
+ }
+ node_type = get_lpt_node_type(c, buf, &node_num);
+ node_len = get_lpt_node_len(c, node_type);
+ ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len);
+ if (ret == 1)
+ dirty += node_len;
+ buf += node_len;
+ len -= node_len;
+ }
+}
+
+/**
+ * dbg_check_ltab - check the free and dirty space in the ltab.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_check_ltab(struct ubifs_info *c)
+{
+ int lnum, err, i, cnt;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ return 0;
+
+ /* Bring the entire tree into memory */
+ cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
+ for (i = 0; i < cnt; i++) {
+ struct ubifs_pnode *pnode;
+
+ pnode = pnode_lookup(c, i);
+ if (IS_ERR(pnode))
+ return PTR_ERR(pnode);
+ cond_resched();
+ }
+
+ /* Check nodes */
+ err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)c->nroot, 0, 0);
+ if (err)
+ return err;
+
+ /* Check each LEB */
+ for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
+ err = dbg_check_ltab_lnum(c, lnum);
+ if (err) {
+ dbg_err("failed at LEB %d", lnum);
+ return err;
+ }
+ }
+
+ dbg_lp("succeeded");
+ return 0;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
new file mode 100644
index 00000000000..71d5493bf56
--- /dev/null
+++ b/fs/ubifs/master.c
@@ -0,0 +1,387 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/* This file implements reading and writing the master node */
+
+#include "ubifs.h"
+
+/**
+ * scan_for_master - search the valid master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function scans the master node LEBs and search for the latest master
+ * node. Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static int scan_for_master(struct ubifs_info *c)
+{
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ int lnum, offs = 0, nodes_cnt;
+
+ lnum = UBIFS_MST_LNUM;
+
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ if (IS_ERR(sleb))
+ return PTR_ERR(sleb);
+ nodes_cnt = sleb->nodes_cnt;
+ if (nodes_cnt > 0) {
+ snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
+ list);
+ if (snod->type != UBIFS_MST_NODE)
+ goto out;
+ memcpy(c->mst_node, snod->node, snod->len);
+ offs = snod->offs;
+ }
+ ubifs_scan_destroy(sleb);
+
+ lnum += 1;
+
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ if (IS_ERR(sleb))
+ return PTR_ERR(sleb);
+ if (sleb->nodes_cnt != nodes_cnt)
+ goto out;
+ if (!sleb->nodes_cnt)
+ goto out;
+ snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list);
+ if (snod->type != UBIFS_MST_NODE)
+ goto out;
+ if (snod->offs != offs)
+ goto out;
+ if (memcmp((void *)c->mst_node + UBIFS_CH_SZ,
+ (void *)snod->node + UBIFS_CH_SZ,
+ UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
+ goto out;
+ c->mst_offs = offs;
+ ubifs_scan_destroy(sleb);
+ return 0;
+
+out:
+ ubifs_scan_destroy(sleb);
+ return -EINVAL;
+}
+
+/**
+ * validate_master - validate master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function validates data which was read from master node. Returns zero
+ * if the data is all right and %-EINVAL if not.
+ */
+static int validate_master(const struct ubifs_info *c)
+{
+ long long main_sz;
+ int err;
+
+ if (c->max_sqnum >= SQNUM_WATERMARK) {
+ err = 1;
+ goto out;
+ }
+
+ if (c->cmt_no >= c->max_sqnum) {
+ err = 2;
+ goto out;
+ }
+
+ if (c->highest_inum >= INUM_WATERMARK) {
+ err = 3;
+ goto out;
+ }
+
+ if (c->lhead_lnum < UBIFS_LOG_LNUM ||
+ c->lhead_lnum >= UBIFS_LOG_LNUM + c->log_lebs ||
+ c->lhead_offs < 0 || c->lhead_offs >= c->leb_size ||
+ c->lhead_offs & (c->min_io_size - 1)) {
+ err = 4;
+ goto out;
+ }
+
+ if (c->zroot.lnum >= c->leb_cnt || c->zroot.lnum < c->main_first ||
+ c->zroot.offs >= c->leb_size || c->zroot.offs & 7) {
+ err = 5;
+ goto out;
+ }
+
+ if (c->zroot.len < c->ranges[UBIFS_IDX_NODE].min_len ||
+ c->zroot.len > c->ranges[UBIFS_IDX_NODE].max_len) {
+ err = 6;
+ goto out;
+ }
+
+ if (c->gc_lnum >= c->leb_cnt || c->gc_lnum < c->main_first) {
+ err = 7;
+ goto out;
+ }
+
+ if (c->ihead_lnum >= c->leb_cnt || c->ihead_lnum < c->main_first ||
+ c->ihead_offs % c->min_io_size || c->ihead_offs < 0 ||
+ c->ihead_offs > c->leb_size || c->ihead_offs & 7) {
+ err = 8;
+ goto out;
+ }
+
+ main_sz = (long long)c->main_lebs * c->leb_size;
+ if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) {
+ err = 9;
+ goto out;
+ }
+
+ if (c->lpt_lnum < c->lpt_first || c->lpt_lnum > c->lpt_last ||
+ c->lpt_offs < 0 || c->lpt_offs + c->nnode_sz > c->leb_size) {
+ err = 10;
+ goto out;
+ }
+
+ if (c->nhead_lnum < c->lpt_first || c->nhead_lnum > c->lpt_last ||
+ c->nhead_offs < 0 || c->nhead_offs % c->min_io_size ||
+ c->nhead_offs > c->leb_size) {
+ err = 11;
+ goto out;
+ }
+
+ if (c->ltab_lnum < c->lpt_first || c->ltab_lnum > c->lpt_last ||
+ c->ltab_offs < 0 ||
+ c->ltab_offs + c->ltab_sz > c->leb_size) {
+ err = 12;
+ goto out;
+ }
+
+ if (c->big_lpt && (c->lsave_lnum < c->lpt_first ||
+ c->lsave_lnum > c->lpt_last || c->lsave_offs < 0 ||
+ c->lsave_offs + c->lsave_sz > c->leb_size)) {
+ err = 13;
+ goto out;
+ }
+
+ if (c->lscan_lnum < c->main_first || c->lscan_lnum >= c->leb_cnt) {
+ err = 14;
+ goto out;
+ }
+
+ if (c->lst.empty_lebs < 0 || c->lst.empty_lebs > c->main_lebs - 2) {
+ err = 15;
+ goto out;
+ }
+
+ if (c->lst.idx_lebs < 0 || c->lst.idx_lebs > c->main_lebs - 1) {
+ err = 16;
+ goto out;
+ }
+
+ if (c->lst.total_free < 0 || c->lst.total_free > main_sz ||
+ c->lst.total_free & 7) {
+ err = 17;
+ goto out;
+ }
+
+ if (c->lst.total_dirty < 0 || (c->lst.total_dirty & 7)) {
+ err = 18;
+ goto out;
+ }
+
+ if (c->lst.total_used < 0 || (c->lst.total_used & 7)) {
+ err = 19;
+ goto out;
+ }
+
+ if (c->lst.total_free + c->lst.total_dirty +
+ c->lst.total_used > main_sz) {
+ err = 20;
+ goto out;
+ }
+
+ if (c->lst.total_dead + c->lst.total_dark +
+ c->lst.total_used + c->old_idx_sz > main_sz) {
+ err = 21;
+ goto out;
+ }
+
+ if (c->lst.total_dead < 0 ||
+ c->lst.total_dead > c->lst.total_free + c->lst.total_dirty ||
+ c->lst.total_dead & 7) {
+ err = 22;
+ goto out;
+ }
+
+ if (c->lst.total_dark < 0 ||
+ c->lst.total_dark > c->lst.total_free + c->lst.total_dirty ||
+ c->lst.total_dark & 7) {
+ err = 23;
+ goto out;
+ }
+
+ return 0;
+
+out:
+ ubifs_err("bad master node at offset %d error %d", c->mst_offs, err);
+ dbg_dump_node(c, c->mst_node);
+ return -EINVAL;
+}
+
+/**
+ * ubifs_read_master - read master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function finds and reads the master node during file-system mount. If
+ * the flash is empty, it creates default master node as well. Returns zero in
+ * case of success and a negative error code in case of failure.
+ */
+int ubifs_read_master(struct ubifs_info *c)
+{
+ int err, old_leb_cnt;
+
+ c->mst_node = kzalloc(c->mst_node_alsz, GFP_KERNEL);
+ if (!c->mst_node)
+ return -ENOMEM;
+
+ err = scan_for_master(c);
+ if (err) {
+ err = ubifs_recover_master_node(c);
+ if (err)
+ /*
+ * Note, we do not free 'c->mst_node' here because the
+ * unmount routine will take care of this.
+ */
+ return err;
+ }
+
+ /* Make sure that the recovery flag is clear */
+ c->mst_node->flags &= cpu_to_le32(~UBIFS_MST_RCVRY);
+
+ c->max_sqnum = le64_to_cpu(c->mst_node->ch.sqnum);
+ c->highest_inum = le64_to_cpu(c->mst_node->highest_inum);
+ c->cmt_no = le64_to_cpu(c->mst_node->cmt_no);
+ c->zroot.lnum = le32_to_cpu(c->mst_node->root_lnum);
+ c->zroot.offs = le32_to_cpu(c->mst_node->root_offs);
+ c->zroot.len = le32_to_cpu(c->mst_node->root_len);
+ c->lhead_lnum = le32_to_cpu(c->mst_node->log_lnum);
+ c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum);
+ c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum);
+ c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs);
+ c->old_idx_sz = le64_to_cpu(c->mst_node->index_size);
+ c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum);
+ c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs);
+ c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum);
+ c->nhead_offs = le32_to_cpu(c->mst_node->nhead_offs);
+ c->ltab_lnum = le32_to_cpu(c->mst_node->ltab_lnum);
+ c->ltab_offs = le32_to_cpu(c->mst_node->ltab_offs);
+ c->lsave_lnum = le32_to_cpu(c->mst_node->lsave_lnum);
+ c->lsave_offs = le32_to_cpu(c->mst_node->lsave_offs);
+ c->lscan_lnum = le32_to_cpu(c->mst_node->lscan_lnum);
+ c->lst.empty_lebs = le32_to_cpu(c->mst_node->empty_lebs);
+ c->lst.idx_lebs = le32_to_cpu(c->mst_node->idx_lebs);
+ old_leb_cnt = le32_to_cpu(c->mst_node->leb_cnt);
+ c->lst.total_free = le64_to_cpu(c->mst_node->total_free);
+ c->lst.total_dirty = le64_to_cpu(c->mst_node->total_dirty);
+ c->lst.total_used = le64_to_cpu(c->mst_node->total_used);
+ c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead);
+ c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark);
+
+ c->calc_idx_sz = c->old_idx_sz;
+
+ if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))
+ c->no_orphs = 1;
+
+ if (old_leb_cnt != c->leb_cnt) {
+ /* The file system has been resized */
+ int growth = c->leb_cnt - old_leb_cnt;
+
+ if (c->leb_cnt < old_leb_cnt ||
+ c->leb_cnt < UBIFS_MIN_LEB_CNT) {
+ ubifs_err("bad leb_cnt on master node");
+ dbg_dump_node(c, c->mst_node);
+ return -EINVAL;
+ }
+
+ dbg_mnt("Auto resizing (master) from %d LEBs to %d LEBs",
+ old_leb_cnt, c->leb_cnt);
+ c->lst.empty_lebs += growth;
+ c->lst.total_free += growth * (long long)c->leb_size;
+ c->lst.total_dark += growth * (long long)c->dark_wm;
+
+ /*
+ * Reflect changes back onto the master node. N.B. the master
+ * node gets written immediately whenever mounting (or
+ * remounting) in read-write mode, so we do not need to write it
+ * here.
+ */
+ c->mst_node->leb_cnt = cpu_to_le32(c->leb_cnt);
+ c->mst_node->empty_lebs = cpu_to_le32(c->lst.empty_lebs);
+ c->mst_node->total_free = cpu_to_le64(c->lst.total_free);
+ c->mst_node->total_dark = cpu_to_le64(c->lst.total_dark);
+ }
+
+ err = validate_master(c);
+ if (err)
+ return err;
+
+ err = dbg_old_index_check_init(c, &c->zroot);
+
+ return err;
+}
+
+/**
+ * ubifs_write_master - write master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function writes the master node. The caller has to take the
+ * @c->mst_mutex lock before calling this function. Returns zero in case of
+ * success and a negative error code in case of failure. The master node is
+ * written twice to enable recovery.
+ */
+int ubifs_write_master(struct ubifs_info *c)
+{
+ int err, lnum, offs, len;
+
+ if (c->ro_media)
+ return -EINVAL;
+
+ lnum = UBIFS_MST_LNUM;
+ offs = c->mst_offs + c->mst_node_alsz;
+ len = UBIFS_MST_NODE_SZ;
+
+ if (offs + UBIFS_MST_NODE_SZ > c->leb_size) {
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ offs = 0;
+ }
+
+ c->mst_offs = offs;
+ c->mst_node->highest_inum = cpu_to_le64(c->highest_inum);
+
+ err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM);
+ if (err)
+ return err;
+
+ lnum += 1;
+
+ if (offs == 0) {
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ }
+ err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM);
+
+ return err;
+}
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
new file mode 100644
index 00000000000..4c12a9215d7
--- /dev/null
+++ b/fs/ubifs/misc.h
@@ -0,0 +1,313 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file contains miscellaneous helper functions.
+ */
+
+#ifndef __UBIFS_MISC_H__
+#define __UBIFS_MISC_H__
+
+/**
+ * ubifs_zn_dirty - check if znode is dirty.
+ * @znode: znode to check
+ *
+ * This helper function returns %1 if @znode is dirty and %0 otherwise.
+ */
+static inline int ubifs_zn_dirty(const struct ubifs_znode *znode)
+{
+ return !!test_bit(DIRTY_ZNODE, &znode->flags);
+}
+
+/**
+ * ubifs_wake_up_bgt - wake up background thread.
+ * @c: UBIFS file-system description object
+ */
+static inline void ubifs_wake_up_bgt(struct ubifs_info *c)
+{
+ if (c->bgt && !c->need_bgt) {
+ c->need_bgt = 1;
+ wake_up_process(c->bgt);
+ }
+}
+
+/**
+ * ubifs_tnc_find_child - find next child in znode.
+ * @znode: znode to search at
+ * @start: the zbranch index to start at
+ *
+ * This helper function looks for znode child starting at index @start. Returns
+ * the child or %NULL if no children were found.
+ */
+static inline struct ubifs_znode *
+ubifs_tnc_find_child(struct ubifs_znode *znode, int start)
+{
+ while (start < znode->child_cnt) {
+ if (znode->zbranch[start].znode)
+ return znode->zbranch[start].znode;
+ start += 1;
+ }
+
+ return NULL;
+}
+
+/**
+ * ubifs_inode - get UBIFS inode information by VFS 'struct inode' object.
+ * @inode: the VFS 'struct inode' pointer
+ */
+static inline struct ubifs_inode *ubifs_inode(const struct inode *inode)
+{
+ return container_of(inode, struct ubifs_inode, vfs_inode);
+}
+
+/**
+ * ubifs_compr_present - check if compressor was compiled in.
+ * @compr_type: compressor type to check
+ *
+ * This function returns %1 of compressor of type @compr_type is present, and
+ * %0 if not.
+ */
+static inline int ubifs_compr_present(int compr_type)
+{
+ ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT);
+ return !!ubifs_compressors[compr_type]->capi_name;
+}
+
+/**
+ * ubifs_compr_name - get compressor name string by its type.
+ * @compr_type: compressor type
+ *
+ * This function returns compressor type string.
+ */
+static inline const char *ubifs_compr_name(int compr_type)
+{
+ ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT);
+ return ubifs_compressors[compr_type]->name;
+}
+
+/**
+ * ubifs_wbuf_sync - synchronize write-buffer.
+ * @wbuf: write-buffer to synchronize
+ *
+ * This is the same as as 'ubifs_wbuf_sync_nolock()' but it does not assume
+ * that the write-buffer is already locked.
+ */
+static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf)
+{
+ int err;
+
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+ err = ubifs_wbuf_sync_nolock(wbuf);
+ mutex_unlock(&wbuf->io_mutex);
+ return err;
+}
+
+/**
+ * ubifs_leb_unmap - unmap an LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to unmap
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum)
+{
+ int err;
+
+ if (c->ro_media)
+ return -EROFS;
+ err = ubi_leb_unmap(c->ubi, lnum);
+ if (err) {
+ ubifs_err("unmap LEB %d failed, error %d", lnum, err);
+ return err;
+ }
+
+ return 0;
+}
+
+/**
+ * ubifs_leb_write - write to a LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to write
+ * @buf: buffer to write from
+ * @offs: offset within LEB to write to
+ * @len: length to write
+ * @dtype: data type
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum,
+ const void *buf, int offs, int len, int dtype)
+{
+ int err;
+
+ if (c->ro_media)
+ return -EROFS;
+ err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
+ if (err) {
+ ubifs_err("writing %d bytes at %d:%d, error %d",
+ len, lnum, offs, err);
+ return err;
+ }
+
+ return 0;
+}
+
+/**
+ * ubifs_leb_change - atomic LEB change.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to write
+ * @buf: buffer to write from
+ * @len: length to write
+ * @dtype: data type
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum,
+ const void *buf, int len, int dtype)
+{
+ int err;
+
+ if (c->ro_media)
+ return -EROFS;
+ err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
+ if (err) {
+ ubifs_err("changing %d bytes in LEB %d, error %d",
+ len, lnum, err);
+ return err;
+ }
+
+ return 0;
+}
+
+/**
+ * ubifs_encode_dev - encode device node IDs.
+ * @dev: UBIFS device node information
+ * @rdev: device IDs to encode
+ *
+ * This is a helper function which encodes major/minor numbers of a device node
+ * into UBIFS device node description. We use standard Linux "new" and "huge"
+ * encodings.
+ */
+static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev)
+{
+ if (new_valid_dev(rdev)) {
+ dev->new = cpu_to_le32(new_encode_dev(rdev));
+ return sizeof(dev->new);
+ } else {
+ dev->huge = cpu_to_le64(huge_encode_dev(rdev));
+ return sizeof(dev->huge);
+ }
+}
+
+/**
+ * ubifs_add_dirt - add dirty space to LEB properties.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to add dirty space for
+ * @dirty: dirty space to add
+ *
+ * This is a helper function which increased amount of dirty LEB space. Returns
+ * zero in case of success and a negative error code in case of failure.
+ */
+static inline int ubifs_add_dirt(struct ubifs_info *c, int lnum, int dirty)
+{
+ return ubifs_update_one_lp(c, lnum, LPROPS_NC, dirty, 0, 0);
+}
+
+/**
+ * ubifs_return_leb - return LEB to lprops.
+ * @c: the UBIFS file-system description object
+ * @lnum: LEB to return
+ *
+ * This helper function cleans the "taken" flag of a logical eraseblock in the
+ * lprops. Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static inline int ubifs_return_leb(struct ubifs_info *c, int lnum)
+{
+ return ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+ LPROPS_TAKEN, 0);
+}
+
+/**
+ * ubifs_idx_node_sz - return index node size.
+ * @c: the UBIFS file-system description object
+ * @child_cnt: number of children of this index node
+ */
+static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt)
+{
+ return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt;
+}
+
+/**
+ * ubifs_idx_branch - return pointer to an index branch.
+ * @c: the UBIFS file-system description object
+ * @idx: index node
+ * @bnum: branch number
+ */
+static inline
+struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c,
+ const struct ubifs_idx_node *idx,
+ int bnum)
+{
+ return (struct ubifs_branch *)((void *)idx->branches +
+ (UBIFS_BRANCH_SZ + c->key_len) * bnum);
+}
+
+/**
+ * ubifs_idx_key - return pointer to an index key.
+ * @c: the UBIFS file-system description object
+ * @idx: index node
+ */
+static inline void *ubifs_idx_key(const struct ubifs_info *c,
+ const struct ubifs_idx_node *idx)
+{
+ return (void *)((struct ubifs_branch *)idx->branches)->key;
+}
+
+/**
+ * ubifs_current_time - round current time to time granularity.
+ * @inode: inode
+ */
+static inline struct timespec ubifs_current_time(struct inode *inode)
+{
+ return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ?
+ current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
+}
+
+/**
+ * ubifs_tnc_lookup - look up a file-system node.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ *
+ * This function look up and reads node with key @key. The caller has to make
+ * sure the @node buffer is large enough to fit the node. Returns zero in case
+ * of success, %-ENOENT if the node was not found, and a negative error code in
+ * case of failure.
+ */
+static inline int ubifs_tnc_lookup(struct ubifs_info *c,
+ const union ubifs_key *key, void *node)
+{
+ return ubifs_tnc_locate(c, key, node, NULL, NULL);
+}
+
+#endif /* __UBIFS_MISC_H__ */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
new file mode 100644
index 00000000000..02d3462f4d3
--- /dev/null
+++ b/fs/ubifs/orphan.c
@@ -0,0 +1,958 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Author: Adrian Hunter
+ */
+
+#include "ubifs.h"
+
+/*
+ * An orphan is an inode number whose inode node has been committed to the index
+ * with a link count of zero. That happens when an open file is deleted
+ * (unlinked) and then a commit is run. In the normal course of events the inode
+ * would be deleted when the file is closed. However in the case of an unclean
+ * unmount, orphans need to be accounted for. After an unclean unmount, the
+ * orphans' inodes must be deleted which means either scanning the entire index
+ * looking for them, or keeping a list on flash somewhere. This unit implements
+ * the latter approach.
+ *
+ * The orphan area is a fixed number of LEBs situated between the LPT area and
+ * the main area. The number of orphan area LEBs is specified when the file
+ * system is created. The minimum number is 1. The size of the orphan area
+ * should be so that it can hold the maximum number of orphans that are expected
+ * to ever exist at one time.
+ *
+ * The number of orphans that can fit in a LEB is:
+ *
+ * (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)
+ *
+ * For example: a 15872 byte LEB can fit 1980 orphans so 1 LEB may be enough.
+ *
+ * Orphans are accumulated in a rb-tree. When an inode's link count drops to
+ * zero, the inode number is added to the rb-tree. It is removed from the tree
+ * when the inode is deleted. Any new orphans that are in the orphan tree when
+ * the commit is run, are written to the orphan area in 1 or more orph nodes.
+ * If the orphan area is full, it is consolidated to make space. There is
+ * always enough space because validation prevents the user from creating more
+ * than the maximum number of orphans allowed.
+ */
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+static int dbg_check_orphans(struct ubifs_info *c);
+#else
+#define dbg_check_orphans(c) 0
+#endif
+
+/**
+ * ubifs_add_orphan - add an orphan.
+ * @c: UBIFS file-system description object
+ * @inum: orphan inode number
+ *
+ * Add an orphan. This function is called when an inodes link count drops to
+ * zero.
+ */
+int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
+{
+ struct ubifs_orphan *orphan, *o;
+ struct rb_node **p, *parent = NULL;
+
+ orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS);
+ if (!orphan)
+ return -ENOMEM;
+ orphan->inum = inum;
+ orphan->new = 1;
+
+ spin_lock(&c->orphan_lock);
+ if (c->tot_orphans >= c->max_orphans) {
+ spin_unlock(&c->orphan_lock);
+ kfree(orphan);
+ return -ENFILE;
+ }
+ p = &c->orph_tree.rb_node;
+ while (*p) {
+ parent = *p;
+ o = rb_entry(parent, struct ubifs_orphan, rb);
+ if (inum < o->inum)
+ p = &(*p)->rb_left;
+ else if (inum > o->inum)
+ p = &(*p)->rb_right;
+ else {
+ dbg_err("orphaned twice");
+ spin_unlock(&c->orphan_lock);
+ kfree(orphan);
+ return 0;
+ }
+ }
+ c->tot_orphans += 1;
+ c->new_orphans += 1;
+ rb_link_node(&orphan->rb, parent, p);
+ rb_insert_color(&orphan->rb, &c->orph_tree);
+ list_add_tail(&orphan->list, &c->orph_list);
+ list_add_tail(&orphan->new_list, &c->orph_new);
+ spin_unlock(&c->orphan_lock);
+ dbg_gen("ino %lu", inum);
+ return 0;
+}
+
+/**
+ * ubifs_delete_orphan - delete an orphan.
+ * @c: UBIFS file-system description object
+ * @inum: orphan inode number
+ *
+ * Delete an orphan. This function is called when an inode is deleted.
+ */
+void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
+{
+ struct ubifs_orphan *o;
+ struct rb_node *p;
+
+ spin_lock(&c->orphan_lock);
+ p = c->orph_tree.rb_node;
+ while (p) {
+ o = rb_entry(p, struct ubifs_orphan, rb);
+ if (inum < o->inum)
+ p = p->rb_left;
+ else if (inum > o->inum)
+ p = p->rb_right;
+ else {
+ if (o->dnext) {
+ spin_unlock(&c->orphan_lock);
+ dbg_gen("deleted twice ino %lu", inum);
+ return;
+ }
+ if (o->cnext) {
+ o->dnext = c->orph_dnext;
+ c->orph_dnext = o;
+ spin_unlock(&c->orphan_lock);
+ dbg_gen("delete later ino %lu", inum);
+ return;
+ }
+ rb_erase(p, &c->orph_tree);
+ list_del(&o->list);
+ c->tot_orphans -= 1;
+ if (o->new) {
+ list_del(&o->new_list);
+ c->new_orphans -= 1;
+ }
+ spin_unlock(&c->orphan_lock);
+ kfree(o);
+ dbg_gen("inum %lu", inum);
+ return;
+ }
+ }
+ spin_unlock(&c->orphan_lock);
+ dbg_err("missing orphan ino %lu", inum);
+ dbg_dump_stack();
+}
+
+/**
+ * ubifs_orphan_start_commit - start commit of orphans.
+ * @c: UBIFS file-system description object
+ *
+ * Start commit of orphans.
+ */
+int ubifs_orphan_start_commit(struct ubifs_info *c)
+{
+ struct ubifs_orphan *orphan, **last;
+
+ spin_lock(&c->orphan_lock);
+ last = &c->orph_cnext;
+ list_for_each_entry(orphan, &c->orph_new, new_list) {
+ ubifs_assert(orphan->new);
+ orphan->new = 0;
+ *last = orphan;
+ last = &orphan->cnext;
+ }
+ *last = orphan->cnext;
+ c->cmt_orphans = c->new_orphans;
+ c->new_orphans = 0;
+ dbg_cmt("%d orphans to commit", c->cmt_orphans);
+ INIT_LIST_HEAD(&c->orph_new);
+ if (c->tot_orphans == 0)
+ c->no_orphs = 1;
+ else
+ c->no_orphs = 0;
+ spin_unlock(&c->orphan_lock);
+ return 0;
+}
+
+/**
+ * avail_orphs - calculate available space.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the number of orphans that can be written in the
+ * available space.
+ */
+static int avail_orphs(struct ubifs_info *c)
+{
+ int avail_lebs, avail, gap;
+
+ avail_lebs = c->orph_lebs - (c->ohead_lnum - c->orph_first) - 1;
+ avail = avail_lebs *
+ ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64));
+ gap = c->leb_size - c->ohead_offs;
+ if (gap >= UBIFS_ORPH_NODE_SZ + sizeof(__le64))
+ avail += (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64);
+ return avail;
+}
+
+/**
+ * tot_avail_orphs - calculate total space.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the number of orphans that can be written in half
+ * the total space. That leaves half the space for adding new orphans.
+ */
+static int tot_avail_orphs(struct ubifs_info *c)
+{
+ int avail_lebs, avail;
+
+ avail_lebs = c->orph_lebs;
+ avail = avail_lebs *
+ ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64));
+ return avail / 2;
+}
+
+/**
+ * do_write_orph_node - write a node
+ * @c: UBIFS file-system description object
+ * @len: length of node
+ * @atomic: write atomically
+ *
+ * This function writes a node to the orphan head from the orphan buffer. If
+ * %atomic is not zero, then the write is done atomically. On success, %0 is
+ * returned, otherwise a negative error code is returned.
+ */
+static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
+{
+ int err = 0;
+
+ if (atomic) {
+ ubifs_assert(c->ohead_offs == 0);
+ ubifs_prepare_node(c, c->orph_buf, len, 1);
+ len = ALIGN(len, c->min_io_size);
+ err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len,
+ UBI_SHORTTERM);
+ } else {
+ if (c->ohead_offs == 0) {
+ /* Ensure LEB has been unmapped */
+ err = ubifs_leb_unmap(c, c->ohead_lnum);
+ if (err)
+ return err;
+ }
+ err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum,
+ c->ohead_offs, UBI_SHORTTERM);
+ }
+ return err;
+}
+
+/**
+ * write_orph_node - write an orph node
+ * @c: UBIFS file-system description object
+ * @atomic: write atomically
+ *
+ * This function builds an orph node from the cnext list and writes it to the
+ * orphan head. On success, %0 is returned, otherwise a negative error code
+ * is returned.
+ */
+static int write_orph_node(struct ubifs_info *c, int atomic)
+{
+ struct ubifs_orphan *orphan, *cnext;
+ struct ubifs_orph_node *orph;
+ int gap, err, len, cnt, i;
+
+ ubifs_assert(c->cmt_orphans > 0);
+ gap = c->leb_size - c->ohead_offs;
+ if (gap < UBIFS_ORPH_NODE_SZ + sizeof(__le64)) {
+ c->ohead_lnum += 1;
+ c->ohead_offs = 0;
+ gap = c->leb_size;
+ if (c->ohead_lnum > c->orph_last) {
+ /*
+ * We limit the number of orphans so that this should
+ * never happen.
+ */
+ ubifs_err("out of space in orphan area");
+ return -EINVAL;
+ }
+ }
+ cnt = (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64);
+ if (cnt > c->cmt_orphans)
+ cnt = c->cmt_orphans;
+ len = UBIFS_ORPH_NODE_SZ + cnt * sizeof(__le64);
+ ubifs_assert(c->orph_buf);
+ orph = c->orph_buf;
+ orph->ch.node_type = UBIFS_ORPH_NODE;
+ spin_lock(&c->orphan_lock);
+ cnext = c->orph_cnext;
+ for (i = 0; i < cnt; i++) {
+ orphan = cnext;
+ orph->inos[i] = cpu_to_le64(orphan->inum);
+ cnext = orphan->cnext;
+ orphan->cnext = NULL;
+ }
+ c->orph_cnext = cnext;
+ c->cmt_orphans -= cnt;
+ spin_unlock(&c->orphan_lock);
+ if (c->cmt_orphans)
+ orph->cmt_no = cpu_to_le64(c->cmt_no);
+ else
+ /* Mark the last node of the commit */
+ orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63));
+ ubifs_assert(c->ohead_offs + len <= c->leb_size);
+ ubifs_assert(c->ohead_lnum >= c->orph_first);
+ ubifs_assert(c->ohead_lnum <= c->orph_last);
+ err = do_write_orph_node(c, len, atomic);
+ c->ohead_offs += ALIGN(len, c->min_io_size);
+ c->ohead_offs = ALIGN(c->ohead_offs, 8);
+ return err;
+}
+
+/**
+ * write_orph_nodes - write orph nodes until there are no more to commit
+ * @c: UBIFS file-system description object
+ * @atomic: write atomically
+ *
+ * This function writes orph nodes for all the orphans to commit. On success,
+ * %0 is returned, otherwise a negative error code is returned.
+ */
+static int write_orph_nodes(struct ubifs_info *c, int atomic)
+{
+ int err;
+
+ while (c->cmt_orphans > 0) {
+ err = write_orph_node(c, atomic);
+ if (err)
+ return err;
+ }
+ if (atomic) {
+ int lnum;
+
+ /* Unmap any unused LEBs after consolidation */
+ lnum = c->ohead_lnum + 1;
+ for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) {
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ }
+ }
+ return 0;
+}
+
+/**
+ * consolidate - consolidate the orphan area.
+ * @c: UBIFS file-system description object
+ *
+ * This function enables consolidation by putting all the orphans into the list
+ * to commit. The list is in the order that the orphans were added, and the
+ * LEBs are written atomically in order, so at no time can orphans be lost by
+ * an unclean unmount.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int consolidate(struct ubifs_info *c)
+{
+ int tot_avail = tot_avail_orphs(c), err = 0;
+
+ spin_lock(&c->orphan_lock);
+ dbg_cmt("there is space for %d orphans and there are %d",
+ tot_avail, c->tot_orphans);
+ if (c->tot_orphans - c->new_orphans <= tot_avail) {
+ struct ubifs_orphan *orphan, **last;
+ int cnt = 0;
+
+ /* Change the cnext list to include all non-new orphans */
+ last = &c->orph_cnext;
+ list_for_each_entry(orphan, &c->orph_list, list) {
+ if (orphan->new)
+ continue;
+ *last = orphan;
+ last = &orphan->cnext;
+ cnt += 1;
+ }
+ *last = orphan->cnext;
+ ubifs_assert(cnt == c->tot_orphans - c->new_orphans);
+ c->cmt_orphans = cnt;
+ c->ohead_lnum = c->orph_first;
+ c->ohead_offs = 0;
+ } else {
+ /*
+ * We limit the number of orphans so that this should
+ * never happen.
+ */
+ ubifs_err("out of space in orphan area");
+ err = -EINVAL;
+ }
+ spin_unlock(&c->orphan_lock);
+ return err;
+}
+
+/**
+ * commit_orphans - commit orphans.
+ * @c: UBIFS file-system description object
+ *
+ * This function commits orphans to flash. On success, %0 is returned,
+ * otherwise a negative error code is returned.
+ */
+static int commit_orphans(struct ubifs_info *c)
+{
+ int avail, atomic = 0, err;
+
+ ubifs_assert(c->cmt_orphans > 0);
+ avail = avail_orphs(c);
+ if (avail < c->cmt_orphans) {
+ /* Not enough space to write new orphans, so consolidate */
+ err = consolidate(c);
+ if (err)
+ return err;
+ atomic = 1;
+ }
+ err = write_orph_nodes(c, atomic);
+ return err;
+}
+
+/**
+ * erase_deleted - erase the orphans marked for deletion.
+ * @c: UBIFS file-system description object
+ *
+ * During commit, the orphans being committed cannot be deleted, so they are
+ * marked for deletion and deleted by this function. Also, the recovery
+ * adds killed orphans to the deletion list, and therefore they are deleted
+ * here too.
+ */
+static void erase_deleted(struct ubifs_info *c)
+{
+ struct ubifs_orphan *orphan, *dnext;
+
+ spin_lock(&c->orphan_lock);
+ dnext = c->orph_dnext;
+ while (dnext) {
+ orphan = dnext;
+ dnext = orphan->dnext;
+ ubifs_assert(!orphan->new);
+ rb_erase(&orphan->rb, &c->orph_tree);
+ list_del(&orphan->list);
+ c->tot_orphans -= 1;
+ dbg_gen("deleting orphan ino %lu", orphan->inum);
+ kfree(orphan);
+ }
+ c->orph_dnext = NULL;
+ spin_unlock(&c->orphan_lock);
+}
+
+/**
+ * ubifs_orphan_end_commit - end commit of orphans.
+ * @c: UBIFS file-system description object
+ *
+ * End commit of orphans.
+ */
+int ubifs_orphan_end_commit(struct ubifs_info *c)
+{
+ int err;
+
+ if (c->cmt_orphans != 0) {
+ err = commit_orphans(c);
+ if (err)
+ return err;
+ }
+ erase_deleted(c);
+ err = dbg_check_orphans(c);
+ return err;
+}
+
+/**
+ * clear_orphans - erase all LEBs used for orphans.
+ * @c: UBIFS file-system description object
+ *
+ * If recovery is not required, then the orphans from the previous session
+ * are not needed. This function locates the LEBs used to record
+ * orphans, and un-maps them.
+ */
+static int clear_orphans(struct ubifs_info *c)
+{
+ int lnum, err;
+
+ for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ }
+ c->ohead_lnum = c->orph_first;
+ c->ohead_offs = 0;
+ return 0;
+}
+
+/**
+ * insert_dead_orphan - insert an orphan.
+ * @c: UBIFS file-system description object
+ * @inum: orphan inode number
+ *
+ * This function is a helper to the 'do_kill_orphans()' function. The orphan
+ * must be kept until the next commit, so it is added to the rb-tree and the
+ * deletion list.
+ */
+static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
+{
+ struct ubifs_orphan *orphan, *o;
+ struct rb_node **p, *parent = NULL;
+
+ orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL);
+ if (!orphan)
+ return -ENOMEM;
+ orphan->inum = inum;
+
+ p = &c->orph_tree.rb_node;
+ while (*p) {
+ parent = *p;
+ o = rb_entry(parent, struct ubifs_orphan, rb);
+ if (inum < o->inum)
+ p = &(*p)->rb_left;
+ else if (inum > o->inum)
+ p = &(*p)->rb_right;
+ else {
+ /* Already added - no problem */
+ kfree(orphan);
+ return 0;
+ }
+ }
+ c->tot_orphans += 1;
+ rb_link_node(&orphan->rb, parent, p);
+ rb_insert_color(&orphan->rb, &c->orph_tree);
+ list_add_tail(&orphan->list, &c->orph_list);
+ orphan->dnext = c->orph_dnext;
+ c->orph_dnext = orphan;
+ dbg_mnt("ino %lu, new %d, tot %d",
+ inum, c->new_orphans, c->tot_orphans);
+ return 0;
+}
+
+/**
+ * do_kill_orphans - remove orphan inodes from the index.
+ * @c: UBIFS file-system description object
+ * @sleb: scanned LEB
+ * @last_cmt_no: cmt_no of last orph node read is passed and returned here
+ * @outofdate: whether the LEB is out of date is returned here
+ * @last_flagged: whether the end orph node is encountered
+ *
+ * This function is a helper to the 'kill_orphans()' function. It goes through
+ * every orphan node in a LEB and for every inode number recorded, removes
+ * all keys for that inode from the TNC.
+ */
+static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+ unsigned long long *last_cmt_no, int *outofdate,
+ int *last_flagged)
+{
+ struct ubifs_scan_node *snod;
+ struct ubifs_orph_node *orph;
+ unsigned long long cmt_no;
+ ino_t inum;
+ int i, n, err, first = 1;
+
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ if (snod->type != UBIFS_ORPH_NODE) {
+ ubifs_err("invalid node type %d in orphan area at "
+ "%d:%d", snod->type, sleb->lnum, snod->offs);
+ dbg_dump_node(c, snod->node);
+ return -EINVAL;
+ }
+
+ orph = snod->node;
+
+ /* Check commit number */
+ cmt_no = le64_to_cpu(orph->cmt_no) & LLONG_MAX;
+ /*
+ * The commit number on the master node may be less, because
+ * of a failed commit. If there are several failed commits in a
+ * row, the commit number written on orph nodes will continue to
+ * increase (because the commit number is adjusted here) even
+ * though the commit number on the master node stays the same
+ * because the master node has not been re-written.
+ */
+ if (cmt_no > c->cmt_no)
+ c->cmt_no = cmt_no;
+ if (cmt_no < *last_cmt_no && *last_flagged) {
+ /*
+ * The last orph node had a higher commit number and was
+ * flagged as the last written for that commit number.
+ * That makes this orph node, out of date.
+ */
+ if (!first) {
+ ubifs_err("out of order commit number %llu in "
+ "orphan node at %d:%d",
+ cmt_no, sleb->lnum, snod->offs);
+ dbg_dump_node(c, snod->node);
+ return -EINVAL;
+ }
+ dbg_rcvry("out of date LEB %d", sleb->lnum);
+ *outofdate = 1;
+ return 0;
+ }
+
+ if (first)
+ first = 0;
+
+ n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
+ for (i = 0; i < n; i++) {
+ inum = le64_to_cpu(orph->inos[i]);
+ dbg_rcvry("deleting orphaned inode %lu", inum);
+ err = ubifs_tnc_remove_ino(c, inum);
+ if (err)
+ return err;
+ err = insert_dead_orphan(c, inum);
+ if (err)
+ return err;
+ }
+
+ *last_cmt_no = cmt_no;
+ if (le64_to_cpu(orph->cmt_no) & (1ULL << 63)) {
+ dbg_rcvry("last orph node for commit %llu at %d:%d",
+ cmt_no, sleb->lnum, snod->offs);
+ *last_flagged = 1;
+ } else
+ *last_flagged = 0;
+ }
+
+ return 0;
+}
+
+/**
+ * kill_orphans - remove all orphan inodes from the index.
+ * @c: UBIFS file-system description object
+ *
+ * If recovery is required, then orphan inodes recorded during the previous
+ * session (which ended with an unclean unmount) must be deleted from the index.
+ * This is done by updating the TNC, but since the index is not updated until
+ * the next commit, the LEBs where the orphan information is recorded are not
+ * erased until the next commit.
+ */
+static int kill_orphans(struct ubifs_info *c)
+{
+ unsigned long long last_cmt_no = 0;
+ int lnum, err = 0, outofdate = 0, last_flagged = 0;
+
+ c->ohead_lnum = c->orph_first;
+ c->ohead_offs = 0;
+ /* Check no-orphans flag and skip this if no orphans */
+ if (c->no_orphs) {
+ dbg_rcvry("no orphans");
+ return 0;
+ }
+ /*
+ * Orph nodes always start at c->orph_first and are written to each
+ * successive LEB in turn. Generally unused LEBs will have been unmapped
+ * but may contain out of date orph nodes if the unmap didn't go
+ * through. In addition, the last orph node written for each commit is
+ * marked (top bit of orph->cmt_no is set to 1). It is possible that
+ * there are orph nodes from the next commit (i.e. the commit did not
+ * complete successfully). In that case, no orphans will have been lost
+ * due to the way that orphans are written, and any orphans added will
+ * be valid orphans anyway and so can be deleted.
+ */
+ for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
+ struct ubifs_scan_leb *sleb;
+
+ dbg_rcvry("LEB %d", lnum);
+ sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+ if (IS_ERR(sleb)) {
+ sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0);
+ if (IS_ERR(sleb)) {
+ err = PTR_ERR(sleb);
+ break;
+ }
+ }
+ err = do_kill_orphans(c, sleb, &last_cmt_no, &outofdate,
+ &last_flagged);
+ if (err || outofdate) {
+ ubifs_scan_destroy(sleb);
+ break;
+ }
+ if (sleb->endpt) {
+ c->ohead_lnum = lnum;
+ c->ohead_offs = sleb->endpt;
+ }
+ ubifs_scan_destroy(sleb);
+ }
+ return err;
+}
+
+/**
+ * ubifs_mount_orphans - delete orphan inodes and erase LEBs that recorded them.
+ * @c: UBIFS file-system description object
+ * @unclean: indicates recovery from unclean unmount
+ * @read_only: indicates read only mount
+ *
+ * This function is called when mounting to erase orphans from the previous
+ * session. If UBIFS was not unmounted cleanly, then the inodes recorded as
+ * orphans are deleted.
+ */
+int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
+{
+ int err = 0;
+
+ c->max_orphans = tot_avail_orphs(c);
+
+ if (!read_only) {
+ c->orph_buf = vmalloc(c->leb_size);
+ if (!c->orph_buf)
+ return -ENOMEM;
+ }
+
+ if (unclean)
+ err = kill_orphans(c);
+ else if (!read_only)
+ err = clear_orphans(c);
+
+ return err;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+struct check_orphan {
+ struct rb_node rb;
+ ino_t inum;
+};
+
+struct check_info {
+ unsigned long last_ino;
+ unsigned long tot_inos;
+ unsigned long missing;
+ unsigned long long leaf_cnt;
+ struct ubifs_ino_node *node;
+ struct rb_root root;
+};
+
+static int dbg_find_orphan(struct ubifs_info *c, ino_t inum)
+{
+ struct ubifs_orphan *o;
+ struct rb_node *p;
+
+ spin_lock(&c->orphan_lock);
+ p = c->orph_tree.rb_node;
+ while (p) {
+ o = rb_entry(p, struct ubifs_orphan, rb);
+ if (inum < o->inum)
+ p = p->rb_left;
+ else if (inum > o->inum)
+ p = p->rb_right;
+ else {
+ spin_unlock(&c->orphan_lock);
+ return 1;
+ }
+ }
+ spin_unlock(&c->orphan_lock);
+ return 0;
+}
+
+static int dbg_ins_check_orphan(struct rb_root *root, ino_t inum)
+{
+ struct check_orphan *orphan, *o;
+ struct rb_node **p, *parent = NULL;
+
+ orphan = kzalloc(sizeof(struct check_orphan), GFP_NOFS);
+ if (!orphan)
+ return -ENOMEM;
+ orphan->inum = inum;
+
+ p = &root->rb_node;
+ while (*p) {
+ parent = *p;
+ o = rb_entry(parent, struct check_orphan, rb);
+ if (inum < o->inum)
+ p = &(*p)->rb_left;
+ else if (inum > o->inum)
+ p = &(*p)->rb_right;
+ else {
+ kfree(orphan);
+ return 0;
+ }
+ }
+ rb_link_node(&orphan->rb, parent, p);
+ rb_insert_color(&orphan->rb, root);
+ return 0;
+}
+
+static int dbg_find_check_orphan(struct rb_root *root, ino_t inum)
+{
+ struct check_orphan *o;
+ struct rb_node *p;
+
+ p = root->rb_node;
+ while (p) {
+ o = rb_entry(p, struct check_orphan, rb);
+ if (inum < o->inum)
+ p = p->rb_left;
+ else if (inum > o->inum)
+ p = p->rb_right;
+ else
+ return 1;
+ }
+ return 0;
+}
+
+static void dbg_free_check_tree(struct rb_root *root)
+{
+ struct rb_node *this = root->rb_node;
+ struct check_orphan *o;
+
+ while (this) {
+ if (this->rb_left) {
+ this = this->rb_left;
+ continue;
+ } else if (this->rb_right) {
+ this = this->rb_right;
+ continue;
+ }
+ o = rb_entry(this, struct check_orphan, rb);
+ this = rb_parent(this);
+ if (this) {
+ if (this->rb_left == &o->rb)
+ this->rb_left = NULL;
+ else
+ this->rb_right = NULL;
+ }
+ kfree(o);
+ }
+}
+
+static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+ void *priv)
+{
+ struct check_info *ci = priv;
+ ino_t inum;
+ int err;
+
+ inum = key_inum(c, &zbr->key);
+ if (inum != ci->last_ino) {
+ /* Lowest node type is the inode node, so it comes first */
+ if (key_type(c, &zbr->key) != UBIFS_INO_KEY)
+ ubifs_err("found orphan node ino %lu, type %d", inum,
+ key_type(c, &zbr->key));
+ ci->last_ino = inum;
+ ci->tot_inos += 1;
+ err = ubifs_tnc_read_node(c, zbr, ci->node);
+ if (err) {
+ ubifs_err("node read failed, error %d", err);
+ return err;
+ }
+ if (ci->node->nlink == 0)
+ /* Must be recorded as an orphan */
+ if (!dbg_find_check_orphan(&ci->root, inum) &&
+ !dbg_find_orphan(c, inum)) {
+ ubifs_err("missing orphan, ino %lu", inum);
+ ci->missing += 1;
+ }
+ }
+ ci->leaf_cnt += 1;
+ return 0;
+}
+
+static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb)
+{
+ struct ubifs_scan_node *snod;
+ struct ubifs_orph_node *orph;
+ ino_t inum;
+ int i, n, err;
+
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ cond_resched();
+ if (snod->type != UBIFS_ORPH_NODE)
+ continue;
+ orph = snod->node;
+ n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3;
+ for (i = 0; i < n; i++) {
+ inum = le64_to_cpu(orph->inos[i]);
+ err = dbg_ins_check_orphan(&ci->root, inum);
+ if (err)
+ return err;
+ }
+ }
+ return 0;
+}
+
+static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
+{
+ int lnum, err = 0;
+
+ /* Check no-orphans flag and skip this if no orphans */
+ if (c->no_orphs)
+ return 0;
+
+ for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
+ struct ubifs_scan_leb *sleb;
+
+ sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
+ if (IS_ERR(sleb)) {
+ err = PTR_ERR(sleb);
+ break;
+ }
+
+ err = dbg_read_orphans(ci, sleb);
+ ubifs_scan_destroy(sleb);
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
+static int dbg_check_orphans(struct ubifs_info *c)
+{
+ struct check_info ci;
+ int err;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_ORPH))
+ return 0;
+
+ ci.last_ino = 0;
+ ci.tot_inos = 0;
+ ci.missing = 0;
+ ci.leaf_cnt = 0;
+ ci.root = RB_ROOT;
+ ci.node = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS);
+ if (!ci.node) {
+ ubifs_err("out of memory");
+ return -ENOMEM;
+ }
+
+ err = dbg_scan_orphans(c, &ci);
+ if (err)
+ goto out;
+
+ err = dbg_walk_index(c, &dbg_orphan_check, NULL, &ci);
+ if (err) {
+ ubifs_err("cannot scan TNC, error %d", err);
+ goto out;
+ }
+
+ if (ci.missing) {
+ ubifs_err("%lu missing orphan(s)", ci.missing);
+ err = -EINVAL;
+ goto out;
+ }
+
+ dbg_cmt("last inode number is %lu", ci.last_ino);
+ dbg_cmt("total number of inodes is %lu", ci.tot_inos);
+ dbg_cmt("total number of leaf nodes is %llu", ci.leaf_cnt);
+
+out:
+ dbg_free_check_tree(&ci.root);
+ kfree(ci.node);
+ return err;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
new file mode 100644
index 00000000000..77d26c141cf
--- /dev/null
+++ b/fs/ubifs/recovery.c
@@ -0,0 +1,1519 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements functions needed to recover from unclean un-mounts.
+ * When UBIFS is mounted, it checks a flag on the master node to determine if
+ * an un-mount was completed sucessfully. If not, the process of mounting
+ * incorparates additional checking and fixing of on-flash data structures.
+ * UBIFS always cleans away all remnants of an unclean un-mount, so that
+ * errors do not accumulate. However UBIFS defers recovery if it is mounted
+ * read-only, and the flash is not modified in that case.
+ */
+
+#include <linux/crc32.h>
+#include "ubifs.h"
+
+/**
+ * is_empty - determine whether a buffer is empty (contains all 0xff).
+ * @buf: buffer to clean
+ * @len: length of buffer
+ *
+ * This function returns %1 if the buffer is empty (contains all 0xff) otherwise
+ * %0 is returned.
+ */
+static int is_empty(void *buf, int len)
+{
+ uint8_t *p = buf;
+ int i;
+
+ for (i = 0; i < len; i++)
+ if (*p++ != 0xff)
+ return 0;
+ return 1;
+}
+
+/**
+ * get_master_node - get the last valid master node allowing for corruption.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @pbuf: buffer containing the LEB read, is returned here
+ * @mst: master node, if found, is returned here
+ * @cor: corruption, if found, is returned here
+ *
+ * This function allocates a buffer, reads the LEB into it, and finds and
+ * returns the last valid master node allowing for one area of corruption.
+ * The corrupt area, if there is one, must be consistent with the assumption
+ * that it is the result of an unclean unmount while the master node was being
+ * written. Under those circumstances, it is valid to use the previously written
+ * master node.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf,
+ struct ubifs_mst_node **mst, void **cor)
+{
+ const int sz = c->mst_node_alsz;
+ int err, offs, len;
+ void *sbuf, *buf;
+
+ sbuf = vmalloc(c->leb_size);
+ if (!sbuf)
+ return -ENOMEM;
+
+ err = ubi_read(c->ubi, lnum, sbuf, 0, c->leb_size);
+ if (err && err != -EBADMSG)
+ goto out_free;
+
+ /* Find the first position that is definitely not a node */
+ offs = 0;
+ buf = sbuf;
+ len = c->leb_size;
+ while (offs + UBIFS_MST_NODE_SZ <= c->leb_size) {
+ struct ubifs_ch *ch = buf;
+
+ if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC)
+ break;
+ offs += sz;
+ buf += sz;
+ len -= sz;
+ }
+ /* See if there was a valid master node before that */
+ if (offs) {
+ int ret;
+
+ offs -= sz;
+ buf -= sz;
+ len += sz;
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+ if (ret != SCANNED_A_NODE && offs) {
+ /* Could have been corruption so check one place back */
+ offs -= sz;
+ buf -= sz;
+ len += sz;
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+ if (ret != SCANNED_A_NODE)
+ /*
+ * We accept only one area of corruption because
+ * we are assuming that it was caused while
+ * trying to write a master node.
+ */
+ goto out_err;
+ }
+ if (ret == SCANNED_A_NODE) {
+ struct ubifs_ch *ch = buf;
+
+ if (ch->node_type != UBIFS_MST_NODE)
+ goto out_err;
+ dbg_rcvry("found a master node at %d:%d", lnum, offs);
+ *mst = buf;
+ offs += sz;
+ buf += sz;
+ len -= sz;
+ }
+ }
+ /* Check for corruption */
+ if (offs < c->leb_size) {
+ if (!is_empty(buf, min_t(int, len, sz))) {
+ *cor = buf;
+ dbg_rcvry("found corruption at %d:%d", lnum, offs);
+ }
+ offs += sz;
+ buf += sz;
+ len -= sz;
+ }
+ /* Check remaining empty space */
+ if (offs < c->leb_size)
+ if (!is_empty(buf, len))
+ goto out_err;
+ *pbuf = sbuf;
+ return 0;
+
+out_err:
+ err = -EINVAL;
+out_free:
+ vfree(sbuf);
+ *mst = NULL;
+ *cor = NULL;
+ return err;
+}
+
+/**
+ * write_rcvrd_mst_node - write recovered master node.
+ * @c: UBIFS file-system description object
+ * @mst: master node
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int write_rcvrd_mst_node(struct ubifs_info *c,
+ struct ubifs_mst_node *mst)
+{
+ int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz;
+ uint32_t save_flags;
+
+ dbg_rcvry("recovery");
+
+ save_flags = mst->flags;
+ mst->flags = cpu_to_le32(le32_to_cpu(mst->flags) | UBIFS_MST_RCVRY);
+
+ ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1);
+ err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM);
+ if (err)
+ goto out;
+ err = ubi_leb_change(c->ubi, lnum + 1, mst, sz, UBI_SHORTTERM);
+ if (err)
+ goto out;
+out:
+ mst->flags = save_flags;
+ return err;
+}
+
+/**
+ * ubifs_recover_master_node - recover the master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function recovers the master node from corruption that may occur due to
+ * an unclean unmount.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_recover_master_node(struct ubifs_info *c)
+{
+ void *buf1 = NULL, *buf2 = NULL, *cor1 = NULL, *cor2 = NULL;
+ struct ubifs_mst_node *mst1 = NULL, *mst2 = NULL, *mst;
+ const int sz = c->mst_node_alsz;
+ int err, offs1, offs2;
+
+ dbg_rcvry("recovery");
+
+ err = get_master_node(c, UBIFS_MST_LNUM, &buf1, &mst1, &cor1);
+ if (err)
+ goto out_free;
+
+ err = get_master_node(c, UBIFS_MST_LNUM + 1, &buf2, &mst2, &cor2);
+ if (err)
+ goto out_free;
+
+ if (mst1) {
+ offs1 = (void *)mst1 - buf1;
+ if ((le32_to_cpu(mst1->flags) & UBIFS_MST_RCVRY) &&
+ (offs1 == 0 && !cor1)) {
+ /*
+ * mst1 was written by recovery at offset 0 with no
+ * corruption.
+ */
+ dbg_rcvry("recovery recovery");
+ mst = mst1;
+ } else if (mst2) {
+ offs2 = (void *)mst2 - buf2;
+ if (offs1 == offs2) {
+ /* Same offset, so must be the same */
+ if (memcmp((void *)mst1 + UBIFS_CH_SZ,
+ (void *)mst2 + UBIFS_CH_SZ,
+ UBIFS_MST_NODE_SZ - UBIFS_CH_SZ))
+ goto out_err;
+ mst = mst1;
+ } else if (offs2 + sz == offs1) {
+ /* 1st LEB was written, 2nd was not */
+ if (cor1)
+ goto out_err;
+ mst = mst1;
+ } else if (offs1 == 0 && offs2 + sz >= c->leb_size) {
+ /* 1st LEB was unmapped and written, 2nd not */
+ if (cor1)
+ goto out_err;
+ mst = mst1;
+ } else
+ goto out_err;
+ } else {
+ /*
+ * 2nd LEB was unmapped and about to be written, so
+ * there must be only one master node in the first LEB
+ * and no corruption.
+ */
+ if (offs1 != 0 || cor1)
+ goto out_err;
+ mst = mst1;
+ }
+ } else {
+ if (!mst2)
+ goto out_err;
+ /*
+ * 1st LEB was unmapped and about to be written, so there must
+ * be no room left in 2nd LEB.
+ */
+ offs2 = (void *)mst2 - buf2;
+ if (offs2 + sz + sz <= c->leb_size)
+ goto out_err;
+ mst = mst2;
+ }
+
+ dbg_rcvry("recovered master node from LEB %d",
+ (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1));
+
+ memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ);
+
+ if ((c->vfs_sb->s_flags & MS_RDONLY)) {
+ /* Read-only mode. Keep a copy for switching to rw mode */
+ c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL);
+ if (!c->rcvrd_mst_node) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ);
+ } else {
+ /* Write the recovered master node */
+ c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1;
+ err = write_rcvrd_mst_node(c, c->mst_node);
+ if (err)
+ goto out_free;
+ }
+
+ vfree(buf2);
+ vfree(buf1);
+
+ return 0;
+
+out_err:
+ err = -EINVAL;
+out_free:
+ ubifs_err("failed to recover master node");
+ if (mst1) {
+ dbg_err("dumping first master node");
+ dbg_dump_node(c, mst1);
+ }
+ if (mst2) {
+ dbg_err("dumping second master node");
+ dbg_dump_node(c, mst2);
+ }
+ vfree(buf2);
+ vfree(buf1);
+ return err;
+}
+
+/**
+ * ubifs_write_rcvrd_mst_node - write the recovered master node.
+ * @c: UBIFS file-system description object
+ *
+ * This function writes the master node that was recovered during mounting in
+ * read-only mode and must now be written because we are remounting rw.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
+{
+ int err;
+
+ if (!c->rcvrd_mst_node)
+ return 0;
+ c->rcvrd_mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
+ c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
+ err = write_rcvrd_mst_node(c, c->rcvrd_mst_node);
+ if (err)
+ return err;
+ kfree(c->rcvrd_mst_node);
+ c->rcvrd_mst_node = NULL;
+ return 0;
+}
+
+/**
+ * is_last_write - determine if an offset was in the last write to a LEB.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to check
+ * @offs: offset to check
+ *
+ * This function returns %1 if @offs was in the last write to the LEB whose data
+ * is in @buf, otherwise %0 is returned. The determination is made by checking
+ * for subsequent empty space starting from the next min_io_size boundary (or a
+ * bit less than the common header size if min_io_size is one).
+ */
+static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
+{
+ int empty_offs;
+ int check_len;
+ uint8_t *p;
+
+ if (c->min_io_size == 1) {
+ check_len = c->leb_size - offs;
+ p = buf + check_len;
+ for (; check_len > 0; check_len--)
+ if (*--p != 0xff)
+ break;
+ /*
+ * 'check_len' is the size of the corruption which cannot be
+ * more than the size of 1 node if it was caused by an unclean
+ * unmount.
+ */
+ if (check_len > UBIFS_MAX_NODE_SZ)
+ return 0;
+ return 1;
+ }
+
+ /*
+ * Round up to the next c->min_io_size boundary i.e. 'offs' is in the
+ * last wbuf written. After that should be empty space.
+ */
+ empty_offs = ALIGN(offs + 1, c->min_io_size);
+ check_len = c->leb_size - empty_offs;
+ p = buf + empty_offs - offs;
+
+ for (; check_len > 0; check_len--)
+ if (*p++ != 0xff)
+ return 0;
+ return 1;
+}
+
+/**
+ * clean_buf - clean the data from an LEB sitting in a buffer.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to clean
+ * @lnum: LEB number to clean
+ * @offs: offset from which to clean
+ * @len: length of buffer
+ *
+ * This function pads up to the next min_io_size boundary (if there is one) and
+ * sets empty space to all 0xff. @buf, @offs and @len are updated to the next
+ * min_io_size boundary (if there is one).
+ */
+static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
+ int *offs, int *len)
+{
+ int empty_offs, pad_len;
+
+ lnum = lnum;
+ dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs);
+
+ if (c->min_io_size == 1) {
+ memset(*buf, 0xff, c->leb_size - *offs);
+ return;
+ }
+
+ ubifs_assert(!(*offs & 7));
+ empty_offs = ALIGN(*offs, c->min_io_size);
+ pad_len = empty_offs - *offs;
+ ubifs_pad(c, *buf, pad_len);
+ *offs += pad_len;
+ *buf += pad_len;
+ *len -= pad_len;
+ memset(*buf, 0xff, c->leb_size - empty_offs);
+}
+
+/**
+ * no_more_nodes - determine if there are no more nodes in a buffer.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to check
+ * @len: length of buffer
+ * @lnum: LEB number of the LEB from which @buf was read
+ * @offs: offset from which @buf was read
+ *
+ * This function scans @buf for more nodes and returns %0 is a node is found and
+ * %1 if no more nodes are found.
+ */
+static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
+ int lnum, int offs)
+{
+ int skip, next_offs = 0;
+
+ if (len > UBIFS_DATA_NODE_SZ) {
+ struct ubifs_ch *ch = buf;
+ int dlen = le32_to_cpu(ch->len);
+
+ if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ &&
+ dlen <= UBIFS_MAX_DATA_NODE_SZ)
+ /* The corrupt node looks like a data node */
+ next_offs = ALIGN(offs + dlen, 8);
+ }
+
+ if (c->min_io_size == 1)
+ skip = 8;
+ else
+ skip = ALIGN(offs + 1, c->min_io_size) - offs;
+
+ offs += skip;
+ buf += skip;
+ len -= skip;
+ while (len > 8) {
+ struct ubifs_ch *ch = buf;
+ uint32_t magic = le32_to_cpu(ch->magic);
+ int ret;
+
+ if (magic == UBIFS_NODE_MAGIC) {
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+ if (ret == SCANNED_A_NODE || ret > 0) {
+ /*
+ * There is a small chance this is just data in
+ * a data node, so check that possibility. e.g.
+ * this is part of a file that itself contains
+ * a UBIFS image.
+ */
+ if (next_offs && offs + le32_to_cpu(ch->len) <=
+ next_offs)
+ continue;
+ dbg_rcvry("unexpected node at %d:%d", lnum,
+ offs);
+ return 0;
+ }
+ }
+ offs += 8;
+ buf += 8;
+ len -= 8;
+ }
+ return 1;
+}
+
+/**
+ * fix_unclean_leb - fix an unclean LEB.
+ * @c: UBIFS file-system description object
+ * @sleb: scanned LEB information
+ * @start: offset where scan started
+ */
+static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+ int start)
+{
+ int lnum = sleb->lnum, endpt = start;
+
+ /* Get the end offset of the last node we are keeping */
+ if (!list_empty(&sleb->nodes)) {
+ struct ubifs_scan_node *snod;
+
+ snod = list_entry(sleb->nodes.prev,
+ struct ubifs_scan_node, list);
+ endpt = snod->offs + snod->len;
+ }
+
+ if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) {
+ /* Add to recovery list */
+ struct ubifs_unclean_leb *ucleb;
+
+ dbg_rcvry("need to fix LEB %d start %d endpt %d",
+ lnum, start, sleb->endpt);
+ ucleb = kzalloc(sizeof(struct ubifs_unclean_leb), GFP_NOFS);
+ if (!ucleb)
+ return -ENOMEM;
+ ucleb->lnum = lnum;
+ ucleb->endpt = endpt;
+ list_add_tail(&ucleb->list, &c->unclean_leb_list);
+ } else {
+ /* Write the fixed LEB back to flash */
+ int err;
+
+ dbg_rcvry("fixing LEB %d start %d endpt %d",
+ lnum, start, sleb->endpt);
+ if (endpt == 0) {
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ } else {
+ int len = ALIGN(endpt, c->min_io_size);
+
+ if (start) {
+ err = ubi_read(c->ubi, lnum, sleb->buf, 0,
+ start);
+ if (err)
+ return err;
+ }
+ /* Pad to min_io_size */
+ if (len > endpt) {
+ int pad_len = len - ALIGN(endpt, 8);
+
+ if (pad_len > 0) {
+ void *buf = sleb->buf + len - pad_len;
+
+ ubifs_pad(c, buf, pad_len);
+ }
+ }
+ err = ubi_leb_change(c->ubi, lnum, sleb->buf, len,
+ UBI_UNKNOWN);
+ if (err)
+ return err;
+ }
+ }
+ return 0;
+}
+
+/**
+ * drop_incomplete_group - drop nodes from an incomplete group.
+ * @sleb: scanned LEB information
+ * @offs: offset of dropped nodes is returned here
+ *
+ * This function returns %1 if nodes are dropped and %0 otherwise.
+ */
+static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs)
+{
+ int dropped = 0;
+
+ while (!list_empty(&sleb->nodes)) {
+ struct ubifs_scan_node *snod;
+ struct ubifs_ch *ch;
+
+ snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
+ list);
+ ch = snod->node;
+ if (ch->group_type != UBIFS_IN_NODE_GROUP)
+ return dropped;
+ dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs);
+ *offs = snod->offs;
+ list_del(&snod->list);
+ kfree(snod);
+ sleb->nodes_cnt -= 1;
+ dropped = 1;
+ }
+ return dropped;
+}
+
+/**
+ * ubifs_recover_leb - scan and recover a LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @offs: offset
+ * @sbuf: LEB-sized buffer to use
+ * @grouped: nodes may be grouped for recovery
+ *
+ * This function does a scan of a LEB, but caters for errors that might have
+ * been caused by the unclean unmount from which we are attempting to recover.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
+ int offs, void *sbuf, int grouped)
+{
+ int err, len = c->leb_size - offs, need_clean = 0, quiet = 1;
+ int empty_chkd = 0, start = offs;
+ struct ubifs_scan_leb *sleb;
+ void *buf = sbuf + offs;
+
+ dbg_rcvry("%d:%d", lnum, offs);
+
+ sleb = ubifs_start_scan(c, lnum, offs, sbuf);
+ if (IS_ERR(sleb))
+ return sleb;
+
+ if (sleb->ecc)
+ need_clean = 1;
+
+ while (len >= 8) {
+ int ret;
+
+ dbg_scan("look at LEB %d:%d (%d bytes left)",
+ lnum, offs, len);
+
+ cond_resched();
+
+ /*
+ * Scan quietly until there is an error from which we cannot
+ * recover
+ */
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
+
+ if (ret == SCANNED_A_NODE) {
+ /* A valid node, and not a padding node */
+ struct ubifs_ch *ch = buf;
+ int node_len;
+
+ err = ubifs_add_snod(c, sleb, buf, offs);
+ if (err)
+ goto error;
+ node_len = ALIGN(le32_to_cpu(ch->len), 8);
+ offs += node_len;
+ buf += node_len;
+ len -= node_len;
+ continue;
+ }
+
+ if (ret > 0) {
+ /* Padding bytes or a valid padding node */
+ offs += ret;
+ buf += ret;
+ len -= ret;
+ continue;
+ }
+
+ if (ret == SCANNED_EMPTY_SPACE) {
+ if (!is_empty(buf, len)) {
+ if (!is_last_write(c, buf, offs))
+ break;
+ clean_buf(c, &buf, lnum, &offs, &len);
+ need_clean = 1;
+ }
+ empty_chkd = 1;
+ break;
+ }
+
+ if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE)
+ if (is_last_write(c, buf, offs)) {
+ clean_buf(c, &buf, lnum, &offs, &len);
+ need_clean = 1;
+ empty_chkd = 1;
+ break;
+ }
+
+ if (ret == SCANNED_A_CORRUPT_NODE)
+ if (no_more_nodes(c, buf, len, lnum, offs)) {
+ clean_buf(c, &buf, lnum, &offs, &len);
+ need_clean = 1;
+ empty_chkd = 1;
+ break;
+ }
+
+ if (quiet) {
+ /* Redo the last scan but noisily */
+ quiet = 0;
+ continue;
+ }
+
+ switch (ret) {
+ case SCANNED_GARBAGE:
+ dbg_err("garbage");
+ goto corrupted;
+ case SCANNED_A_CORRUPT_NODE:
+ case SCANNED_A_BAD_PAD_NODE:
+ dbg_err("bad node");
+ goto corrupted;
+ default:
+ dbg_err("unknown");
+ goto corrupted;
+ }
+ }
+
+ if (!empty_chkd && !is_empty(buf, len)) {
+ if (is_last_write(c, buf, offs)) {
+ clean_buf(c, &buf, lnum, &offs, &len);
+ need_clean = 1;
+ } else {
+ ubifs_err("corrupt empty space at LEB %d:%d",
+ lnum, offs);
+ goto corrupted;
+ }
+ }
+
+ /* Drop nodes from incomplete group */
+ if (grouped && drop_incomplete_group(sleb, &offs)) {
+ buf = sbuf + offs;
+ len = c->leb_size - offs;
+ clean_buf(c, &buf, lnum, &offs, &len);
+ need_clean = 1;
+ }
+
+ if (offs % c->min_io_size) {
+ clean_buf(c, &buf, lnum, &offs, &len);
+ need_clean = 1;
+ }
+
+ ubifs_end_scan(c, sleb, lnum, offs);
+
+ if (need_clean) {
+ err = fix_unclean_leb(c, sleb, start);
+ if (err)
+ goto error;
+ }
+
+ return sleb;
+
+corrupted:
+ ubifs_scanned_corruption(c, lnum, offs, buf);
+ err = -EUCLEAN;
+error:
+ ubifs_err("LEB %d scanning failed", lnum);
+ ubifs_scan_destroy(sleb);
+ return ERR_PTR(err);
+}
+
+/**
+ * get_cs_sqnum - get commit start sequence number.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of commit start node
+ * @offs: offset of commit start node
+ * @cs_sqnum: commit start sequence number is returned here
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs,
+ unsigned long long *cs_sqnum)
+{
+ struct ubifs_cs_node *cs_node = NULL;
+ int err, ret;
+
+ dbg_rcvry("at %d:%d", lnum, offs);
+ cs_node = kmalloc(UBIFS_CS_NODE_SZ, GFP_KERNEL);
+ if (!cs_node)
+ return -ENOMEM;
+ if (c->leb_size - offs < UBIFS_CS_NODE_SZ)
+ goto out_err;
+ err = ubi_read(c->ubi, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ);
+ if (err && err != -EBADMSG)
+ goto out_free;
+ ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0);
+ if (ret != SCANNED_A_NODE) {
+ dbg_err("Not a valid node");
+ goto out_err;
+ }
+ if (cs_node->ch.node_type != UBIFS_CS_NODE) {
+ dbg_err("Node a CS node, type is %d", cs_node->ch.node_type);
+ goto out_err;
+ }
+ if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) {
+ dbg_err("CS node cmt_no %llu != current cmt_no %llu",
+ (unsigned long long)le64_to_cpu(cs_node->cmt_no),
+ c->cmt_no);
+ goto out_err;
+ }
+ *cs_sqnum = le64_to_cpu(cs_node->ch.sqnum);
+ dbg_rcvry("commit start sqnum %llu", *cs_sqnum);
+ kfree(cs_node);
+ return 0;
+
+out_err:
+ err = -EINVAL;
+out_free:
+ ubifs_err("failed to get CS sqnum");
+ kfree(cs_node);
+ return err;
+}
+
+/**
+ * ubifs_recover_log_leb - scan and recover a log LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @offs: offset
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function does a scan of a LEB, but caters for errors that might have
+ * been caused by the unclean unmount from which we are attempting to recover.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
+ int offs, void *sbuf)
+{
+ struct ubifs_scan_leb *sleb;
+ int next_lnum;
+
+ dbg_rcvry("LEB %d", lnum);
+ next_lnum = lnum + 1;
+ if (next_lnum >= UBIFS_LOG_LNUM + c->log_lebs)
+ next_lnum = UBIFS_LOG_LNUM;
+ if (next_lnum != c->ltail_lnum) {
+ /*
+ * We can only recover at the end of the log, so check that the
+ * next log LEB is empty or out of date.
+ */
+ sleb = ubifs_scan(c, next_lnum, 0, sbuf);
+ if (IS_ERR(sleb))
+ return sleb;
+ if (sleb->nodes_cnt) {
+ struct ubifs_scan_node *snod;
+ unsigned long long cs_sqnum = c->cs_sqnum;
+
+ snod = list_entry(sleb->nodes.next,
+ struct ubifs_scan_node, list);
+ if (cs_sqnum == 0) {
+ int err;
+
+ err = get_cs_sqnum(c, lnum, offs, &cs_sqnum);
+ if (err) {
+ ubifs_scan_destroy(sleb);
+ return ERR_PTR(err);
+ }
+ }
+ if (snod->sqnum > cs_sqnum) {
+ ubifs_err("unrecoverable log corruption "
+ "in LEB %d", lnum);
+ ubifs_scan_destroy(sleb);
+ return ERR_PTR(-EUCLEAN);
+ }
+ }
+ ubifs_scan_destroy(sleb);
+ }
+ return ubifs_recover_leb(c, lnum, offs, sbuf, 0);
+}
+
+/**
+ * recover_head - recover a head.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of head to recover
+ * @offs: offset of head to recover
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function ensures that there is no data on the flash at a head location.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int recover_head(const struct ubifs_info *c, int lnum, int offs,
+ void *sbuf)
+{
+ int len, err, need_clean = 0;
+
+ if (c->min_io_size > 1)
+ len = c->min_io_size;
+ else
+ len = 512;
+ if (offs + len > c->leb_size)
+ len = c->leb_size - offs;
+
+ if (!len)
+ return 0;
+
+ /* Read at the head location and check it is empty flash */
+ err = ubi_read(c->ubi, lnum, sbuf, offs, len);
+ if (err)
+ need_clean = 1;
+ else {
+ uint8_t *p = sbuf;
+
+ while (len--)
+ if (*p++ != 0xff) {
+ need_clean = 1;
+ break;
+ }
+ }
+
+ if (need_clean) {
+ dbg_rcvry("cleaning head at %d:%d", lnum, offs);
+ if (offs == 0)
+ return ubifs_leb_unmap(c, lnum);
+ err = ubi_read(c->ubi, lnum, sbuf, 0, offs);
+ if (err)
+ return err;
+ return ubi_leb_change(c->ubi, lnum, sbuf, offs, UBI_UNKNOWN);
+ }
+
+ return 0;
+}
+
+/**
+ * ubifs_recover_inl_heads - recover index and LPT heads.
+ * @c: UBIFS file-system description object
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function ensures that there is no data on the flash at the index and
+ * LPT head locations.
+ *
+ * This deals with the recovery of a half-completed journal commit. UBIFS is
+ * careful never to overwrite the last version of the index or the LPT. Because
+ * the index and LPT are wandering trees, data from a half-completed commit will
+ * not be referenced anywhere in UBIFS. The data will be either in LEBs that are
+ * assumed to be empty and will be unmapped anyway before use, or in the index
+ * and LPT heads.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf)
+{
+ int err;
+
+ ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw);
+
+ dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs);
+ err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf);
+ if (err)
+ return err;
+
+ dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum, c->nhead_offs);
+ err = recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+/**
+ * clean_an_unclean_leb - read and write a LEB to remove corruption.
+ * @c: UBIFS file-system description object
+ * @ucleb: unclean LEB information
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function reads a LEB up to a point pre-determined by the mount recovery,
+ * checks the nodes, and writes the result back to the flash, thereby cleaning
+ * off any following corruption, or non-fatal ECC errors.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int clean_an_unclean_leb(const struct ubifs_info *c,
+ struct ubifs_unclean_leb *ucleb, void *sbuf)
+{
+ int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1;
+ void *buf = sbuf;
+
+ dbg_rcvry("LEB %d len %d", lnum, len);
+
+ if (len == 0) {
+ /* Nothing to read, just unmap it */
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ return 0;
+ }
+
+ err = ubi_read(c->ubi, lnum, buf, offs, len);
+ if (err && err != -EBADMSG)
+ return err;
+
+ while (len >= 8) {
+ int ret;
+
+ cond_resched();
+
+ /* Scan quietly until there is an error */
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet);
+
+ if (ret == SCANNED_A_NODE) {
+ /* A valid node, and not a padding node */
+ struct ubifs_ch *ch = buf;
+ int node_len;
+
+ node_len = ALIGN(le32_to_cpu(ch->len), 8);
+ offs += node_len;
+ buf += node_len;
+ len -= node_len;
+ continue;
+ }
+
+ if (ret > 0) {
+ /* Padding bytes or a valid padding node */
+ offs += ret;
+ buf += ret;
+ len -= ret;
+ continue;
+ }
+
+ if (ret == SCANNED_EMPTY_SPACE) {
+ ubifs_err("unexpected empty space at %d:%d",
+ lnum, offs);
+ return -EUCLEAN;
+ }
+
+ if (quiet) {
+ /* Redo the last scan but noisily */
+ quiet = 0;
+ continue;
+ }
+
+ ubifs_scanned_corruption(c, lnum, offs, buf);
+ return -EUCLEAN;
+ }
+
+ /* Pad to min_io_size */
+ len = ALIGN(ucleb->endpt, c->min_io_size);
+ if (len > ucleb->endpt) {
+ int pad_len = len - ALIGN(ucleb->endpt, 8);
+
+ if (pad_len > 0) {
+ buf = c->sbuf + len - pad_len;
+ ubifs_pad(c, buf, pad_len);
+ }
+ }
+
+ /* Write back the LEB atomically */
+ err = ubi_leb_change(c->ubi, lnum, sbuf, len, UBI_UNKNOWN);
+ if (err)
+ return err;
+
+ dbg_rcvry("cleaned LEB %d", lnum);
+
+ return 0;
+}
+
+/**
+ * ubifs_clean_lebs - clean LEBs recovered during read-only mount.
+ * @c: UBIFS file-system description object
+ * @sbuf: LEB-sized buffer to use
+ *
+ * This function cleans a LEB identified during recovery that needs to be
+ * written but was not because UBIFS was mounted read-only. This happens when
+ * remounting to read-write mode.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf)
+{
+ dbg_rcvry("recovery");
+ while (!list_empty(&c->unclean_leb_list)) {
+ struct ubifs_unclean_leb *ucleb;
+ int err;
+
+ ucleb = list_entry(c->unclean_leb_list.next,
+ struct ubifs_unclean_leb, list);
+ err = clean_an_unclean_leb(c, ucleb, sbuf);
+ if (err)
+ return err;
+ list_del(&ucleb->list);
+ kfree(ucleb);
+ }
+ return 0;
+}
+
+/**
+ * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit.
+ * @c: UBIFS file-system description object
+ *
+ * Out-of-place garbage collection requires always one empty LEB with which to
+ * start garbage collection. The LEB number is recorded in c->gc_lnum and is
+ * written to the master node on unmounting. In the case of an unclean unmount
+ * the value of gc_lnum recorded in the master node is out of date and cannot
+ * be used. Instead, recovery must allocate an empty LEB for this purpose.
+ * However, there may not be enough empty space, in which case it must be
+ * possible to GC the dirtiest LEB into the GC head LEB.
+ *
+ * This function also runs the commit which causes the TNC updates from
+ * size-recovery and orphans to be written to the flash. That is important to
+ * ensure correct replay order for subsequent mounts.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_rcvry_gc_commit(struct ubifs_info *c)
+{
+ struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+ struct ubifs_lprops lp;
+ int lnum, err;
+
+ c->gc_lnum = -1;
+ if (wbuf->lnum == -1) {
+ dbg_rcvry("no GC head LEB");
+ goto find_free;
+ }
+ /*
+ * See whether the used space in the dirtiest LEB fits in the GC head
+ * LEB.
+ */
+ if (wbuf->offs == c->leb_size) {
+ dbg_rcvry("no room in GC head LEB");
+ goto find_free;
+ }
+ err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2);
+ if (err) {
+ if (err == -ENOSPC)
+ dbg_err("could not find a dirty LEB");
+ return err;
+ }
+ ubifs_assert(!(lp.flags & LPROPS_INDEX));
+ lnum = lp.lnum;
+ if (lp.free + lp.dirty == c->leb_size) {
+ /* An empty LEB was returned */
+ if (lp.free != c->leb_size) {
+ err = ubifs_change_one_lp(c, lnum, c->leb_size,
+ 0, 0, 0, 0);
+ if (err)
+ return err;
+ }
+ err = ubifs_leb_unmap(c, lnum);
+ if (err)
+ return err;
+ c->gc_lnum = lnum;
+ dbg_rcvry("allocated LEB %d for GC", lnum);
+ /* Run the commit */
+ dbg_rcvry("committing");
+ return ubifs_run_commit(c);
+ }
+ /*
+ * There was no empty LEB so the used space in the dirtiest LEB must fit
+ * in the GC head LEB.
+ */
+ if (lp.free + lp.dirty < wbuf->offs) {
+ dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d",
+ lnum, wbuf->lnum, wbuf->offs);
+ err = ubifs_return_leb(c, lnum);
+ if (err)
+ return err;
+ goto find_free;
+ }
+ /*
+ * We run the commit before garbage collection otherwise subsequent
+ * mounts will see the GC and orphan deletion in a different order.
+ */
+ dbg_rcvry("committing");
+ err = ubifs_run_commit(c);
+ if (err)
+ return err;
+ /*
+ * The data in the dirtiest LEB fits in the GC head LEB, so do the GC
+ * - use locking to keep 'ubifs_assert()' happy.
+ */
+ dbg_rcvry("GC'ing LEB %d", lnum);
+ mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+ err = ubifs_garbage_collect_leb(c, &lp);
+ if (err >= 0) {
+ int err2 = ubifs_wbuf_sync_nolock(wbuf);
+
+ if (err2)
+ err = err2;
+ }
+ mutex_unlock(&wbuf->io_mutex);
+ if (err < 0) {
+ dbg_err("GC failed, error %d", err);
+ if (err == -EAGAIN)
+ err = -EINVAL;
+ return err;
+ }
+ if (err != LEB_RETAINED) {
+ dbg_err("GC returned %d", err);
+ return -EINVAL;
+ }
+ err = ubifs_leb_unmap(c, c->gc_lnum);
+ if (err)
+ return err;
+ dbg_rcvry("allocated LEB %d for GC", lnum);
+ return 0;
+
+find_free:
+ /*
+ * There is no GC head LEB or the free space in the GC head LEB is too
+ * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so
+ * GC is not run.
+ */
+ lnum = ubifs_find_free_leb_for_idx(c);
+ if (lnum < 0) {
+ dbg_err("could not find an empty LEB");
+ return lnum;
+ }
+ /* And reset the index flag */
+ err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+ LPROPS_INDEX, 0);
+ if (err)
+ return err;
+ c->gc_lnum = lnum;
+ dbg_rcvry("allocated LEB %d for GC", lnum);
+ /* Run the commit */
+ dbg_rcvry("committing");
+ return ubifs_run_commit(c);
+}
+
+/**
+ * struct size_entry - inode size information for recovery.
+ * @rb: link in the RB-tree of sizes
+ * @inum: inode number
+ * @i_size: size on inode
+ * @d_size: maximum size based on data nodes
+ * @exists: indicates whether the inode exists
+ * @inode: inode if pinned in memory awaiting rw mode to fix it
+ */
+struct size_entry {
+ struct rb_node rb;
+ ino_t inum;
+ loff_t i_size;
+ loff_t d_size;
+ int exists;
+ struct inode *inode;
+};
+
+/**
+ * add_ino - add an entry to the size tree.
+ * @c: UBIFS file-system description object
+ * @inum: inode number
+ * @i_size: size on inode
+ * @d_size: maximum size based on data nodes
+ * @exists: indicates whether the inode exists
+ */
+static int add_ino(struct ubifs_info *c, ino_t inum, loff_t i_size,
+ loff_t d_size, int exists)
+{
+ struct rb_node **p = &c->size_tree.rb_node, *parent = NULL;
+ struct size_entry *e;
+
+ while (*p) {
+ parent = *p;
+ e = rb_entry(parent, struct size_entry, rb);
+ if (inum < e->inum)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+
+ e = kzalloc(sizeof(struct size_entry), GFP_KERNEL);
+ if (!e)
+ return -ENOMEM;
+
+ e->inum = inum;
+ e->i_size = i_size;
+ e->d_size = d_size;
+ e->exists = exists;
+
+ rb_link_node(&e->rb, parent, p);
+ rb_insert_color(&e->rb, &c->size_tree);
+
+ return 0;
+}
+
+/**
+ * find_ino - find an entry on the size tree.
+ * @c: UBIFS file-system description object
+ * @inum: inode number
+ */
+static struct size_entry *find_ino(struct ubifs_info *c, ino_t inum)
+{
+ struct rb_node *p = c->size_tree.rb_node;
+ struct size_entry *e;
+
+ while (p) {
+ e = rb_entry(p, struct size_entry, rb);
+ if (inum < e->inum)
+ p = p->rb_left;
+ else if (inum > e->inum)
+ p = p->rb_right;
+ else
+ return e;
+ }
+ return NULL;
+}
+
+/**
+ * remove_ino - remove an entry from the size tree.
+ * @c: UBIFS file-system description object
+ * @inum: inode number
+ */
+static void remove_ino(struct ubifs_info *c, ino_t inum)
+{
+ struct size_entry *e = find_ino(c, inum);
+
+ if (!e)
+ return;
+ rb_erase(&e->rb, &c->size_tree);
+ kfree(e);
+}
+
+/**
+ * ubifs_destroy_size_tree - free resources related to the size tree.
+ * @c: UBIFS file-system description object
+ */
+void ubifs_destroy_size_tree(struct ubifs_info *c)
+{
+ struct rb_node *this = c->size_tree.rb_node;
+ struct size_entry *e;
+
+ while (this) {
+ if (this->rb_left) {
+ this = this->rb_left;
+ continue;
+ } else if (this->rb_right) {
+ this = this->rb_right;
+ continue;
+ }
+ e = rb_entry(this, struct size_entry, rb);
+ if (e->inode)
+ iput(e->inode);
+ this = rb_parent(this);
+ if (this) {
+ if (this->rb_left == &e->rb)
+ this->rb_left = NULL;
+ else
+ this->rb_right = NULL;
+ }
+ kfree(e);
+ }
+ c->size_tree = RB_ROOT;
+}
+
+/**
+ * ubifs_recover_size_accum - accumulate inode sizes for recovery.
+ * @c: UBIFS file-system description object
+ * @key: node key
+ * @deletion: node is for a deletion
+ * @new_size: inode size
+ *
+ * This function has two purposes:
+ * 1) to ensure there are no data nodes that fall outside the inode size
+ * 2) to ensure there are no data nodes for inodes that do not exist
+ * To accomplish those purposes, a rb-tree is constructed containing an entry
+ * for each inode number in the journal that has not been deleted, and recording
+ * the size from the inode node, the maximum size of any data node (also altered
+ * by truncations) and a flag indicating a inode number for which no inode node
+ * was present in the journal.
+ *
+ * Note that there is still the possibility that there are data nodes that have
+ * been committed that are beyond the inode size, however the only way to find
+ * them would be to scan the entire index. Alternatively, some provision could
+ * be made to record the size of inodes at the start of commit, which would seem
+ * very cumbersome for a scenario that is quite unlikely and the only negative
+ * consequence of which is wasted space.
+ *
+ * This functions returns %0 on success and a negative error code on failure.
+ */
+int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key,
+ int deletion, loff_t new_size)
+{
+ ino_t inum = key_inum(c, key);
+ struct size_entry *e;
+ int err;
+
+ switch (key_type(c, key)) {
+ case UBIFS_INO_KEY:
+ if (deletion)
+ remove_ino(c, inum);
+ else {
+ e = find_ino(c, inum);
+ if (e) {
+ e->i_size = new_size;
+ e->exists = 1;
+ } else {
+ err = add_ino(c, inum, new_size, 0, 1);
+ if (err)
+ return err;
+ }
+ }
+ break;
+ case UBIFS_DATA_KEY:
+ e = find_ino(c, inum);
+ if (e) {
+ if (new_size > e->d_size)
+ e->d_size = new_size;
+ } else {
+ err = add_ino(c, inum, 0, new_size, 0);
+ if (err)
+ return err;
+ }
+ break;
+ case UBIFS_TRUN_KEY:
+ e = find_ino(c, inum);
+ if (e)
+ e->d_size = new_size;
+ break;
+ }
+ return 0;
+}
+
+/**
+ * fix_size_in_place - fix inode size in place on flash.
+ * @c: UBIFS file-system description object
+ * @e: inode size information for recovery
+ */
+static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e)
+{
+ struct ubifs_ino_node *ino = c->sbuf;
+ unsigned char *p;
+ union ubifs_key key;
+ int err, lnum, offs, len;
+ loff_t i_size;
+ uint32_t crc;
+
+ /* Locate the inode node LEB number and offset */
+ ino_key_init(c, &key, e->inum);
+ err = ubifs_tnc_locate(c, &key, ino, &lnum, &offs);
+ if (err)
+ goto out;
+ /*
+ * If the size recorded on the inode node is greater than the size that
+ * was calculated from nodes in the journal then don't change the inode.
+ */
+ i_size = le64_to_cpu(ino->size);
+ if (i_size >= e->d_size)
+ return 0;
+ /* Read the LEB */
+ err = ubi_read(c->ubi, lnum, c->sbuf, 0, c->leb_size);
+ if (err)
+ goto out;
+ /* Change the size field and recalculate the CRC */
+ ino = c->sbuf + offs;
+ ino->size = cpu_to_le64(e->d_size);
+ len = le32_to_cpu(ino->ch.len);
+ crc = crc32(UBIFS_CRC32_INIT, (void *)ino + 8, len - 8);
+ ino->ch.crc = cpu_to_le32(crc);
+ /* Work out where data in the LEB ends and free space begins */
+ p = c->sbuf;
+ len = c->leb_size - 1;
+ while (p[len] == 0xff)
+ len -= 1;
+ len = ALIGN(len + 1, c->min_io_size);
+ /* Atomically write the fixed LEB back again */
+ err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN);
+ if (err)
+ goto out;
+ dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", e->inum, lnum, offs,
+ i_size, e->d_size);
+ return 0;
+
+out:
+ ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d",
+ e->inum, e->i_size, e->d_size, err);
+ return err;
+}
+
+/**
+ * ubifs_recover_size - recover inode size.
+ * @c: UBIFS file-system description object
+ *
+ * This function attempts to fix inode size discrepancies identified by the
+ * 'ubifs_recover_size_accum()' function.
+ *
+ * This functions returns %0 on success and a negative error code on failure.
+ */
+int ubifs_recover_size(struct ubifs_info *c)
+{
+ struct rb_node *this = rb_first(&c->size_tree);
+
+ while (this) {
+ struct size_entry *e;
+ int err;
+
+ e = rb_entry(this, struct size_entry, rb);
+ if (!e->exists) {
+ union ubifs_key key;
+
+ ino_key_init(c, &key, e->inum);
+ err = ubifs_tnc_lookup(c, &key, c->sbuf);
+ if (err && err != -ENOENT)
+ return err;
+ if (err == -ENOENT) {
+ /* Remove data nodes that have no inode */
+ dbg_rcvry("removing ino %lu", e->inum);
+ err = ubifs_tnc_remove_ino(c, e->inum);
+ if (err)
+ return err;
+ } else {
+ struct ubifs_ino_node *ino = c->sbuf;
+
+ e->exists = 1;
+ e->i_size = le64_to_cpu(ino->size);
+ }
+ }
+ if (e->exists && e->i_size < e->d_size) {
+ if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) {
+ /* Fix the inode size and pin it in memory */
+ struct inode *inode;
+
+ inode = ubifs_iget(c->vfs_sb, e->inum);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ if (inode->i_size < e->d_size) {
+ dbg_rcvry("ino %lu size %lld -> %lld",
+ e->inum, e->d_size,
+ inode->i_size);
+ inode->i_size = e->d_size;
+ ubifs_inode(inode)->ui_size = e->d_size;
+ e->inode = inode;
+ this = rb_next(this);
+ continue;
+ }
+ iput(inode);
+ } else {
+ /* Fix the size in place */
+ err = fix_size_in_place(c, e);
+ if (err)
+ return err;
+ if (e->inode)
+ iput(e->inode);
+ }
+ }
+ this = rb_next(this);
+ rb_erase(&e->rb, &c->size_tree);
+ kfree(e);
+ }
+ return 0;
+}
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
new file mode 100644
index 00000000000..7399692af85
--- /dev/null
+++ b/fs/ubifs/replay.c
@@ -0,0 +1,1075 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file contains journal replay code. It runs when the file-system is being
+ * mounted and requires no locking.
+ *
+ * The larger is the journal, the longer it takes to scan it, so the longer it
+ * takes to mount UBIFS. This is why the journal has limited size which may be
+ * changed depending on the system requirements. But a larger journal gives
+ * faster I/O speed because it writes the index less frequently. So this is a
+ * trade-off. Also, the journal is indexed by the in-memory index (TNC), so the
+ * larger is the journal, the more memory its index may consume.
+ */
+
+#include "ubifs.h"
+
+/*
+ * Replay flags.
+ *
+ * REPLAY_DELETION: node was deleted
+ * REPLAY_REF: node is a reference node
+ */
+enum {
+ REPLAY_DELETION = 1,
+ REPLAY_REF = 2,
+};
+
+/**
+ * struct replay_entry - replay tree entry.
+ * @lnum: logical eraseblock number of the node
+ * @offs: node offset
+ * @len: node length
+ * @sqnum: node sequence number
+ * @flags: replay flags
+ * @rb: links the replay tree
+ * @key: node key
+ * @nm: directory entry name
+ * @old_size: truncation old size
+ * @new_size: truncation new size
+ * @free: amount of free space in a bud
+ * @dirty: amount of dirty space in a bud from padding and deletion nodes
+ *
+ * UBIFS journal replay must compare node sequence numbers, which means it must
+ * build a tree of node information to insert into the TNC.
+ */
+struct replay_entry {
+ int lnum;
+ int offs;
+ int len;
+ unsigned long long sqnum;
+ int flags;
+ struct rb_node rb;
+ union ubifs_key key;
+ union {
+ struct qstr nm;
+ struct {
+ loff_t old_size;
+ loff_t new_size;
+ };
+ struct {
+ int free;
+ int dirty;
+ };
+ };
+};
+
+/**
+ * struct bud_entry - entry in the list of buds to replay.
+ * @list: next bud in the list
+ * @bud: bud description object
+ * @free: free bytes in the bud
+ * @sqnum: reference node sequence number
+ */
+struct bud_entry {
+ struct list_head list;
+ struct ubifs_bud *bud;
+ int free;
+ unsigned long long sqnum;
+};
+
+/**
+ * set_bud_lprops - set free and dirty space used by a bud.
+ * @c: UBIFS file-system description object
+ * @r: replay entry of bud
+ */
+static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
+{
+ const struct ubifs_lprops *lp;
+ int err = 0, dirty;
+
+ ubifs_get_lprops(c);
+
+ lp = ubifs_lpt_lookup_dirty(c, r->lnum);
+ if (IS_ERR(lp)) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+
+ dirty = lp->dirty;
+ if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
+ /*
+ * The LEB was added to the journal with a starting offset of
+ * zero which means the LEB must have been empty. The LEB
+ * property values should be lp->free == c->leb_size and
+ * lp->dirty == 0, but that is not the case. The reason is that
+ * the LEB was garbage collected. The garbage collector resets
+ * the free and dirty space without recording it anywhere except
+ * lprops, so if there is not a commit then lprops does not have
+ * that information next time the file system is mounted.
+ *
+ * We do not need to adjust free space because the scan has told
+ * us the exact value which is recorded in the replay entry as
+ * r->free.
+ *
+ * However we do need to subtract from the dirty space the
+ * amount of space that the garbage collector reclaimed, which
+ * is the whole LEB minus the amount of space that was free.
+ */
+ dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+ lp->free, lp->dirty);
+ dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+ lp->free, lp->dirty);
+ dirty -= c->leb_size - lp->free;
+ /*
+ * If the replay order was perfect the dirty space would now be
+ * zero. The order is not perfect because the the journal heads
+ * race with eachother. This is not a problem but is does mean
+ * that the dirty space may temporarily exceed c->leb_size
+ * during the replay.
+ */
+ if (dirty != 0)
+ dbg_msg("LEB %d lp: %d free %d dirty "
+ "replay: %d free %d dirty", r->lnum, lp->free,
+ lp->dirty, r->free, r->dirty);
+ }
+ lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty,
+ lp->flags | LPROPS_TAKEN, 0);
+ if (IS_ERR(lp)) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * trun_remove_range - apply a replay entry for a truncation to the TNC.
+ * @c: UBIFS file-system description object
+ * @r: replay entry of truncation
+ */
+static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
+{
+ unsigned min_blk, max_blk;
+ union ubifs_key min_key, max_key;
+ ino_t ino;
+
+ min_blk = r->new_size / UBIFS_BLOCK_SIZE;
+ if (r->new_size & (UBIFS_BLOCK_SIZE - 1))
+ min_blk += 1;
+
+ max_blk = r->old_size / UBIFS_BLOCK_SIZE;
+ if ((r->old_size & (UBIFS_BLOCK_SIZE - 1)) == 0)
+ max_blk -= 1;
+
+ ino = key_inum(c, &r->key);
+
+ data_key_init(c, &min_key, ino, min_blk);
+ data_key_init(c, &max_key, ino, max_blk);
+
+ return ubifs_tnc_remove_range(c, &min_key, &max_key);
+}
+
+/**
+ * apply_replay_entry - apply a replay entry to the TNC.
+ * @c: UBIFS file-system description object
+ * @r: replay entry to apply
+ *
+ * Apply a replay entry to the TNC.
+ */
+static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
+{
+ int err, deletion = ((r->flags & REPLAY_DELETION) != 0);
+
+ dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum,
+ r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key));
+
+ /* Set c->replay_sqnum to help deal with dangling branches. */
+ c->replay_sqnum = r->sqnum;
+
+ if (r->flags & REPLAY_REF)
+ err = set_bud_lprops(c, r);
+ else if (is_hash_key(c, &r->key)) {
+ if (deletion)
+ err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
+ else
+ err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
+ r->len, &r->nm);
+ } else {
+ if (deletion)
+ switch (key_type(c, &r->key)) {
+ case UBIFS_INO_KEY:
+ {
+ ino_t inum = key_inum(c, &r->key);
+
+ err = ubifs_tnc_remove_ino(c, inum);
+ break;
+ }
+ case UBIFS_TRUN_KEY:
+ err = trun_remove_range(c, r);
+ break;
+ default:
+ err = ubifs_tnc_remove(c, &r->key);
+ break;
+ }
+ else
+ err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs,
+ r->len);
+ if (err)
+ return err;
+
+ if (c->need_recovery)
+ err = ubifs_recover_size_accum(c, &r->key, deletion,
+ r->new_size);
+ }
+
+ return err;
+}
+
+/**
+ * destroy_replay_tree - destroy the replay.
+ * @c: UBIFS file-system description object
+ *
+ * Destroy the replay tree.
+ */
+static void destroy_replay_tree(struct ubifs_info *c)
+{
+ struct rb_node *this = c->replay_tree.rb_node;
+ struct replay_entry *r;
+
+ while (this) {
+ if (this->rb_left) {
+ this = this->rb_left;
+ continue;
+ } else if (this->rb_right) {
+ this = this->rb_right;
+ continue;
+ }
+ r = rb_entry(this, struct replay_entry, rb);
+ this = rb_parent(this);
+ if (this) {
+ if (this->rb_left == &r->rb)
+ this->rb_left = NULL;
+ else
+ this->rb_right = NULL;
+ }
+ if (is_hash_key(c, &r->key))
+ kfree(r->nm.name);
+ kfree(r);
+ }
+ c->replay_tree = RB_ROOT;
+}
+
+/**
+ * apply_replay_tree - apply the replay tree to the TNC.
+ * @c: UBIFS file-system description object
+ *
+ * Apply the replay tree.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static int apply_replay_tree(struct ubifs_info *c)
+{
+ struct rb_node *this = rb_first(&c->replay_tree);
+
+ while (this) {
+ struct replay_entry *r;
+ int err;
+
+ cond_resched();
+
+ r = rb_entry(this, struct replay_entry, rb);
+ err = apply_replay_entry(c, r);
+ if (err)
+ return err;
+ this = rb_next(this);
+ }
+ return 0;
+}
+
+/**
+ * insert_node - insert a node to the replay tree.
+ * @c: UBIFS file-system description object
+ * @lnum: node logical eraseblock number
+ * @offs: node offset
+ * @len: node length
+ * @key: node key
+ * @sqnum: sequence number
+ * @deletion: non-zero if this is a deletion
+ * @used: number of bytes in use in a LEB
+ * @old_size: truncation old size
+ * @new_size: truncation new size
+ *
+ * This function inserts a scanned non-direntry node to the replay tree. The
+ * replay tree is an RB-tree containing @struct replay_entry elements which are
+ * indexed by the sequence number. The replay tree is applied at the very end
+ * of the replay process. Since the tree is sorted in sequence number order,
+ * the older modifications are applied first. This function returns zero in
+ * case of success and a negative error code in case of failure.
+ */
+static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
+ union ubifs_key *key, unsigned long long sqnum,
+ int deletion, int *used, loff_t old_size,
+ loff_t new_size)
+{
+ struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
+ struct replay_entry *r;
+
+ if (key_inum(c, key) >= c->highest_inum)
+ c->highest_inum = key_inum(c, key);
+
+ dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+ while (*p) {
+ parent = *p;
+ r = rb_entry(parent, struct replay_entry, rb);
+ if (sqnum < r->sqnum) {
+ p = &(*p)->rb_left;
+ continue;
+ } else if (sqnum > r->sqnum) {
+ p = &(*p)->rb_right;
+ continue;
+ }
+ ubifs_err("duplicate sqnum in replay");
+ return -EINVAL;
+ }
+
+ r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
+ if (!r)
+ return -ENOMEM;
+
+ if (!deletion)
+ *used += ALIGN(len, 8);
+ r->lnum = lnum;
+ r->offs = offs;
+ r->len = len;
+ r->sqnum = sqnum;
+ r->flags = (deletion ? REPLAY_DELETION : 0);
+ r->old_size = old_size;
+ r->new_size = new_size;
+ key_copy(c, key, &r->key);
+
+ rb_link_node(&r->rb, parent, p);
+ rb_insert_color(&r->rb, &c->replay_tree);
+ return 0;
+}
+
+/**
+ * insert_dent - insert a directory entry node into the replay tree.
+ * @c: UBIFS file-system description object
+ * @lnum: node logical eraseblock number
+ * @offs: node offset
+ * @len: node length
+ * @key: node key
+ * @name: directory entry name
+ * @nlen: directory entry name length
+ * @sqnum: sequence number
+ * @deletion: non-zero if this is a deletion
+ * @used: number of bytes in use in a LEB
+ *
+ * This function inserts a scanned directory entry node to the replay tree.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ *
+ * This function is also used for extended attribute entries because they are
+ * implemented as directory entry nodes.
+ */
+static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
+ union ubifs_key *key, const char *name, int nlen,
+ unsigned long long sqnum, int deletion, int *used)
+{
+ struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
+ struct replay_entry *r;
+ char *nbuf;
+
+ if (key_inum(c, key) >= c->highest_inum)
+ c->highest_inum = key_inum(c, key);
+
+ dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+ while (*p) {
+ parent = *p;
+ r = rb_entry(parent, struct replay_entry, rb);
+ if (sqnum < r->sqnum) {
+ p = &(*p)->rb_left;
+ continue;
+ }
+ if (sqnum > r->sqnum) {
+ p = &(*p)->rb_right;
+ continue;
+ }
+ ubifs_err("duplicate sqnum in replay");
+ return -EINVAL;
+ }
+
+ r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
+ if (!r)
+ return -ENOMEM;
+ nbuf = kmalloc(nlen + 1, GFP_KERNEL);
+ if (!nbuf) {
+ kfree(r);
+ return -ENOMEM;
+ }
+
+ if (!deletion)
+ *used += ALIGN(len, 8);
+ r->lnum = lnum;
+ r->offs = offs;
+ r->len = len;
+ r->sqnum = sqnum;
+ r->nm.len = nlen;
+ memcpy(nbuf, name, nlen);
+ nbuf[nlen] = '\0';
+ r->nm.name = nbuf;
+ r->flags = (deletion ? REPLAY_DELETION : 0);
+ key_copy(c, key, &r->key);
+
+ ubifs_assert(!*p);
+ rb_link_node(&r->rb, parent, p);
+ rb_insert_color(&r->rb, &c->replay_tree);
+ return 0;
+}
+
+/**
+ * ubifs_validate_entry - validate directory or extended attribute entry node.
+ * @c: UBIFS file-system description object
+ * @dent: the node to validate
+ *
+ * This function validates directory or extended attribute entry node @dent.
+ * Returns zero if the node is all right and a %-EINVAL if not.
+ */
+int ubifs_validate_entry(struct ubifs_info *c,
+ const struct ubifs_dent_node *dent)
+{
+ int key_type = key_type_flash(c, dent->key);
+ int nlen = le16_to_cpu(dent->nlen);
+
+ if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 ||
+ dent->type >= UBIFS_ITYPES_CNT ||
+ nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 ||
+ strnlen(dent->name, nlen) != nlen ||
+ le64_to_cpu(dent->inum) > MAX_INUM) {
+ ubifs_err("bad %s node", key_type == UBIFS_DENT_KEY ?
+ "directory entry" : "extended attribute entry");
+ return -EINVAL;
+ }
+
+ if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) {
+ ubifs_err("bad key type %d", key_type);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * replay_bud - replay a bud logical eraseblock.
+ * @c: UBIFS file-system description object
+ * @lnum: bud logical eraseblock number to replay
+ * @offs: bud start offset
+ * @jhead: journal head to which this bud belongs
+ * @free: amount of free space in the bud is returned here
+ * @dirty: amount of dirty space from padding and deletion nodes is returned
+ * here
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
+ int *free, int *dirty)
+{
+ int err = 0, used = 0;
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ struct ubifs_bud *bud;
+
+ dbg_mnt("replay bud LEB %d, head %d", lnum, jhead);
+ if (c->need_recovery)
+ sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
+ else
+ sleb = ubifs_scan(c, lnum, offs, c->sbuf);
+ if (IS_ERR(sleb))
+ return PTR_ERR(sleb);
+
+ /*
+ * The bud does not have to start from offset zero - the beginning of
+ * the 'lnum' LEB may contain previously committed data. One of the
+ * things we have to do in replay is to correctly update lprops with
+ * newer information about this LEB.
+ *
+ * At this point lprops thinks that this LEB has 'c->leb_size - offs'
+ * bytes of free space because it only contain information about
+ * committed data.
+ *
+ * But we know that real amount of free space is 'c->leb_size -
+ * sleb->endpt', and the space in the 'lnum' LEB between 'offs' and
+ * 'sleb->endpt' is used by bud data. We have to correctly calculate
+ * how much of these data are dirty and update lprops with this
+ * information.
+ *
+ * The dirt in that LEB region is comprised of padding nodes, deletion
+ * nodes, truncation nodes and nodes which are obsoleted by subsequent
+ * nodes in this LEB. So instead of calculating clean space, we
+ * calculate used space ('used' variable).
+ */
+
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ int deletion = 0;
+
+ cond_resched();
+
+ if (snod->sqnum >= SQNUM_WATERMARK) {
+ ubifs_err("file system's life ended");
+ goto out_dump;
+ }
+
+ if (snod->sqnum > c->max_sqnum)
+ c->max_sqnum = snod->sqnum;
+
+ switch (snod->type) {
+ case UBIFS_INO_NODE:
+ {
+ struct ubifs_ino_node *ino = snod->node;
+ loff_t new_size = le64_to_cpu(ino->size);
+
+ if (le32_to_cpu(ino->nlink) == 0)
+ deletion = 1;
+ err = insert_node(c, lnum, snod->offs, snod->len,
+ &snod->key, snod->sqnum, deletion,
+ &used, 0, new_size);
+ break;
+ }
+ case UBIFS_DATA_NODE:
+ {
+ struct ubifs_data_node *dn = snod->node;
+ loff_t new_size = le32_to_cpu(dn->size) +
+ key_block(c, &snod->key) *
+ UBIFS_BLOCK_SIZE;
+
+ err = insert_node(c, lnum, snod->offs, snod->len,
+ &snod->key, snod->sqnum, deletion,
+ &used, 0, new_size);
+ break;
+ }
+ case UBIFS_DENT_NODE:
+ case UBIFS_XENT_NODE:
+ {
+ struct ubifs_dent_node *dent = snod->node;
+
+ err = ubifs_validate_entry(c, dent);
+ if (err)
+ goto out_dump;
+
+ err = insert_dent(c, lnum, snod->offs, snod->len,
+ &snod->key, dent->name,
+ le16_to_cpu(dent->nlen), snod->sqnum,
+ !le64_to_cpu(dent->inum), &used);
+ break;
+ }
+ case UBIFS_TRUN_NODE:
+ {
+ struct ubifs_trun_node *trun = snod->node;
+ loff_t old_size = le64_to_cpu(trun->old_size);
+ loff_t new_size = le64_to_cpu(trun->new_size);
+ union ubifs_key key;
+
+ /* Validate truncation node */
+ if (old_size < 0 || old_size > c->max_inode_sz ||
+ new_size < 0 || new_size > c->max_inode_sz ||
+ old_size <= new_size) {
+ ubifs_err("bad truncation node");
+ goto out_dump;
+ }
+
+ /*
+ * Create a fake truncation key just to use the same
+ * functions which expect nodes to have keys.
+ */
+ trun_key_init(c, &key, le32_to_cpu(trun->inum));
+ err = insert_node(c, lnum, snod->offs, snod->len,
+ &key, snod->sqnum, 1, &used,
+ old_size, new_size);
+ break;
+ }
+ default:
+ ubifs_err("unexpected node type %d in bud LEB %d:%d",
+ snod->type, lnum, snod->offs);
+ err = -EINVAL;
+ goto out_dump;
+ }
+ if (err)
+ goto out;
+ }
+
+ bud = ubifs_search_bud(c, lnum);
+ if (!bud)
+ BUG();
+
+ ubifs_assert(sleb->endpt - offs >= used);
+ ubifs_assert(sleb->endpt % c->min_io_size == 0);
+
+ if (sleb->endpt + c->min_io_size <= c->leb_size &&
+ !(c->vfs_sb->s_flags & MS_RDONLY))
+ err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum,
+ sleb->endpt, UBI_SHORTTERM);
+
+ *dirty = sleb->endpt - offs - used;
+ *free = c->leb_size - sleb->endpt;
+
+out:
+ ubifs_scan_destroy(sleb);
+ return err;
+
+out_dump:
+ ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs);
+ dbg_dump_node(c, snod->node);
+ ubifs_scan_destroy(sleb);
+ return -EINVAL;
+}
+
+/**
+ * insert_ref_node - insert a reference node to the replay tree.
+ * @c: UBIFS file-system description object
+ * @lnum: node logical eraseblock number
+ * @offs: node offset
+ * @sqnum: sequence number
+ * @free: amount of free space in bud
+ * @dirty: amount of dirty space from padding and deletion nodes
+ *
+ * This function inserts a reference node to the replay tree and returns zero
+ * in case of success ort a negative error code in case of failure.
+ */
+static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
+ unsigned long long sqnum, int free, int dirty)
+{
+ struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
+ struct replay_entry *r;
+
+ dbg_mnt("add ref LEB %d:%d", lnum, offs);
+ while (*p) {
+ parent = *p;
+ r = rb_entry(parent, struct replay_entry, rb);
+ if (sqnum < r->sqnum) {
+ p = &(*p)->rb_left;
+ continue;
+ } else if (sqnum > r->sqnum) {
+ p = &(*p)->rb_right;
+ continue;
+ }
+ ubifs_err("duplicate sqnum in replay tree");
+ return -EINVAL;
+ }
+
+ r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
+ if (!r)
+ return -ENOMEM;
+
+ r->lnum = lnum;
+ r->offs = offs;
+ r->sqnum = sqnum;
+ r->flags = REPLAY_REF;
+ r->free = free;
+ r->dirty = dirty;
+
+ rb_link_node(&r->rb, parent, p);
+ rb_insert_color(&r->rb, &c->replay_tree);
+ return 0;
+}
+
+/**
+ * replay_buds - replay all buds.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int replay_buds(struct ubifs_info *c)
+{
+ struct bud_entry *b;
+ int err, uninitialized_var(free), uninitialized_var(dirty);
+
+ list_for_each_entry(b, &c->replay_buds, list) {
+ err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead,
+ &free, &dirty);
+ if (err)
+ return err;
+ err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum,
+ free, dirty);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/**
+ * destroy_bud_list - destroy the list of buds to replay.
+ * @c: UBIFS file-system description object
+ */
+static void destroy_bud_list(struct ubifs_info *c)
+{
+ struct bud_entry *b;
+
+ while (!list_empty(&c->replay_buds)) {
+ b = list_entry(c->replay_buds.next, struct bud_entry, list);
+ list_del(&b->list);
+ kfree(b);
+ }
+}
+
+/**
+ * add_replay_bud - add a bud to the list of buds to replay.
+ * @c: UBIFS file-system description object
+ * @lnum: bud logical eraseblock number to replay
+ * @offs: bud start offset
+ * @jhead: journal head to which this bud belongs
+ * @sqnum: reference node sequence number
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
+ unsigned long long sqnum)
+{
+ struct ubifs_bud *bud;
+ struct bud_entry *b;
+
+ dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead);
+
+ bud = kmalloc(sizeof(struct ubifs_bud), GFP_KERNEL);
+ if (!bud)
+ return -ENOMEM;
+
+ b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL);
+ if (!b) {
+ kfree(bud);
+ return -ENOMEM;
+ }
+
+ bud->lnum = lnum;
+ bud->start = offs;
+ bud->jhead = jhead;
+ ubifs_add_bud(c, bud);
+
+ b->bud = bud;
+ b->sqnum = sqnum;
+ list_add_tail(&b->list, &c->replay_buds);
+
+ return 0;
+}
+
+/**
+ * validate_ref - validate a reference node.
+ * @c: UBIFS file-system description object
+ * @ref: the reference node to validate
+ * @ref_lnum: LEB number of the reference node
+ * @ref_offs: reference node offset
+ *
+ * This function returns %1 if a bud reference already exists for the LEB. %0 is
+ * returned if the reference node is new, otherwise %-EINVAL is returned if
+ * validation failed.
+ */
+static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref)
+{
+ struct ubifs_bud *bud;
+ int lnum = le32_to_cpu(ref->lnum);
+ unsigned int offs = le32_to_cpu(ref->offs);
+ unsigned int jhead = le32_to_cpu(ref->jhead);
+
+ /*
+ * ref->offs may point to the end of LEB when the journal head points
+ * to the end of LEB and we write reference node for it during commit.
+ * So this is why we require 'offs > c->leb_size'.
+ */
+ if (jhead >= c->jhead_cnt || lnum >= c->leb_cnt ||
+ lnum < c->main_first || offs > c->leb_size ||
+ offs & (c->min_io_size - 1))
+ return -EINVAL;
+
+ /* Make sure we have not already looked at this bud */
+ bud = ubifs_search_bud(c, lnum);
+ if (bud) {
+ if (bud->jhead == jhead && bud->start <= offs)
+ return 1;
+ ubifs_err("bud at LEB %d:%d was already referred", lnum, offs);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * replay_log_leb - replay a log logical eraseblock.
+ * @c: UBIFS file-system description object
+ * @lnum: log logical eraseblock to replay
+ * @offs: offset to start replaying from
+ * @sbuf: scan buffer
+ *
+ * This function replays a log LEB and returns zero in case of success, %1 if
+ * this is the last LEB in the log, and a negative error code in case of
+ * failure.
+ */
+static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
+{
+ int err;
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ const struct ubifs_cs_node *node;
+
+ dbg_mnt("replay log LEB %d:%d", lnum, offs);
+ sleb = ubifs_scan(c, lnum, offs, sbuf);
+ if (IS_ERR(sleb)) {
+ if (c->need_recovery)
+ sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
+ if (IS_ERR(sleb))
+ return PTR_ERR(sleb);
+ }
+
+ if (sleb->nodes_cnt == 0) {
+ err = 1;
+ goto out;
+ }
+
+ node = sleb->buf;
+
+ snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
+ if (c->cs_sqnum == 0) {
+ /*
+ * This is the first log LEB we are looking at, make sure that
+ * the first node is a commit start node. Also record its
+ * sequence number so that UBIFS can determine where the log
+ * ends, because all nodes which were have higher sequence
+ * numbers.
+ */
+ if (snod->type != UBIFS_CS_NODE) {
+ dbg_err("first log node at LEB %d:%d is not CS node",
+ lnum, offs);
+ goto out_dump;
+ }
+ if (le64_to_cpu(node->cmt_no) != c->cmt_no) {
+ dbg_err("first CS node at LEB %d:%d has wrong "
+ "commit number %llu expected %llu",
+ lnum, offs,
+ (unsigned long long)le64_to_cpu(node->cmt_no),
+ c->cmt_no);
+ goto out_dump;
+ }
+
+ c->cs_sqnum = le64_to_cpu(node->ch.sqnum);
+ dbg_mnt("commit start sqnum %llu", c->cs_sqnum);
+ }
+
+ if (snod->sqnum < c->cs_sqnum) {
+ /*
+ * This means that we reached end of log and now
+ * look to the older log data, which was already
+ * committed but the eraseblock was not erased (UBIFS
+ * only unmaps it). So this basically means we have to
+ * exit with "end of log" code.
+ */
+ err = 1;
+ goto out;
+ }
+
+ /* Make sure the first node sits at offset zero of the LEB */
+ if (snod->offs != 0) {
+ dbg_err("first node is not at zero offset");
+ goto out_dump;
+ }
+
+ list_for_each_entry(snod, &sleb->nodes, list) {
+
+ cond_resched();
+
+ if (snod->sqnum >= SQNUM_WATERMARK) {
+ ubifs_err("file system's life ended");
+ goto out_dump;
+ }
+
+ if (snod->sqnum < c->cs_sqnum) {
+ dbg_err("bad sqnum %llu, commit sqnum %llu",
+ snod->sqnum, c->cs_sqnum);
+ goto out_dump;
+ }
+
+ if (snod->sqnum > c->max_sqnum)
+ c->max_sqnum = snod->sqnum;
+
+ switch (snod->type) {
+ case UBIFS_REF_NODE: {
+ const struct ubifs_ref_node *ref = snod->node;
+
+ err = validate_ref(c, ref);
+ if (err == 1)
+ break; /* Already have this bud */
+ if (err)
+ goto out_dump;
+
+ err = add_replay_bud(c, le32_to_cpu(ref->lnum),
+ le32_to_cpu(ref->offs),
+ le32_to_cpu(ref->jhead),
+ snod->sqnum);
+ if (err)
+ goto out;
+
+ break;
+ }
+ case UBIFS_CS_NODE:
+ /* Make sure it sits at the beginning of LEB */
+ if (snod->offs != 0) {
+ ubifs_err("unexpected node in log");
+ goto out_dump;
+ }
+ break;
+ default:
+ ubifs_err("unexpected node in log");
+ goto out_dump;
+ }
+ }
+
+ if (sleb->endpt || c->lhead_offs >= c->leb_size) {
+ c->lhead_lnum = lnum;
+ c->lhead_offs = sleb->endpt;
+ }
+
+ err = !sleb->endpt;
+out:
+ ubifs_scan_destroy(sleb);
+ return err;
+
+out_dump:
+ ubifs_err("log error detected while replying the log at LEB %d:%d",
+ lnum, offs + snod->offs);
+ dbg_dump_node(c, snod->node);
+ ubifs_scan_destroy(sleb);
+ return -EINVAL;
+}
+
+/**
+ * take_ihead - update the status of the index head in lprops to 'taken'.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the amount of free space in the index head LEB or a
+ * negative error code.
+ */
+static int take_ihead(struct ubifs_info *c)
+{
+ const struct ubifs_lprops *lp;
+ int err, free;
+
+ ubifs_get_lprops(c);
+
+ lp = ubifs_lpt_lookup_dirty(c, c->ihead_lnum);
+ if (IS_ERR(lp)) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+
+ free = lp->free;
+
+ lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
+ lp->flags | LPROPS_TAKEN, 0);
+ if (IS_ERR(lp)) {
+ err = PTR_ERR(lp);
+ goto out;
+ }
+
+ err = free;
+out:
+ ubifs_release_lprops(c);
+ return err;
+}
+
+/**
+ * ubifs_replay_journal - replay journal.
+ * @c: UBIFS file-system description object
+ *
+ * This function scans the journal, replays and cleans it up. It makes sure all
+ * memory data structures related to uncommitted journal are built (dirty TNC
+ * tree, tree of buds, modified lprops, etc).
+ */
+int ubifs_replay_journal(struct ubifs_info *c)
+{
+ int err, i, lnum, offs, free;
+ void *sbuf = NULL;
+
+ BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
+
+ /* Update the status of the index head in lprops to 'taken' */
+ free = take_ihead(c);
+ if (free < 0)
+ return free; /* Error code */
+
+ if (c->ihead_offs != c->leb_size - free) {
+ ubifs_err("bad index head LEB %d:%d", c->ihead_lnum,
+ c->ihead_offs);
+ return -EINVAL;
+ }
+
+ sbuf = vmalloc(c->leb_size);
+ if (!sbuf)
+ return -ENOMEM;
+
+ dbg_mnt("start replaying the journal");
+
+ c->replaying = 1;
+
+ lnum = c->ltail_lnum = c->lhead_lnum;
+ offs = c->lhead_offs;
+
+ for (i = 0; i < c->log_lebs; i++, lnum++) {
+ if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) {
+ /*
+ * The log is logically circular, we reached the last
+ * LEB, switch to the first one.
+ */
+ lnum = UBIFS_LOG_LNUM;
+ offs = 0;
+ }
+ err = replay_log_leb(c, lnum, offs, sbuf);
+ if (err == 1)
+ /* We hit the end of the log */
+ break;
+ if (err)
+ goto out;
+ offs = 0;
+ }
+
+ err = replay_buds(c);
+ if (err)
+ goto out;
+
+ err = apply_replay_tree(c);
+ if (err)
+ goto out;
+
+ ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
+ dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
+ "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
+ c->highest_inum);
+out:
+ destroy_replay_tree(c);
+ destroy_bud_list(c);
+ vfree(sbuf);
+ c->replaying = 0;
+ return err;
+}
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
new file mode 100644
index 00000000000..2bf753b3888
--- /dev/null
+++ b/fs/ubifs/sb.c
@@ -0,0 +1,629 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS superblock. The superblock is stored at the first
+ * LEB of the volume and is never changed by UBIFS. Only user-space tools may
+ * change it. The superblock node mostly contains geometry information.
+ */
+
+#include "ubifs.h"
+#include <linux/random.h>
+
+/*
+ * Default journal size in logical eraseblocks as a percent of total
+ * flash size.
+ */
+#define DEFAULT_JNL_PERCENT 5
+
+/* Default maximum journal size in bytes */
+#define DEFAULT_MAX_JNL (32*1024*1024)
+
+/* Default indexing tree fanout */
+#define DEFAULT_FANOUT 8
+
+/* Default number of data journal heads */
+#define DEFAULT_JHEADS_CNT 1
+
+/* Default positions of different LEBs in the main area */
+#define DEFAULT_IDX_LEB 0
+#define DEFAULT_DATA_LEB 1
+#define DEFAULT_GC_LEB 2
+
+/* Default number of LEB numbers in LPT's save table */
+#define DEFAULT_LSAVE_CNT 256
+
+/* Default reserved pool size as a percent of maximum free space */
+#define DEFAULT_RP_PERCENT 5
+
+/* The default maximum size of reserved pool in bytes */
+#define DEFAULT_MAX_RP_SIZE (5*1024*1024)
+
+/* Default time granularity in nanoseconds */
+#define DEFAULT_TIME_GRAN 1000000000
+
+/**
+ * create_default_filesystem - format empty UBI volume.
+ * @c: UBIFS file-system description object
+ *
+ * This function creates default empty file-system. Returns zero in case of
+ * success and a negative error code in case of failure.
+ */
+static int create_default_filesystem(struct ubifs_info *c)
+{
+ struct ubifs_sb_node *sup;
+ struct ubifs_mst_node *mst;
+ struct ubifs_idx_node *idx;
+ struct ubifs_branch *br;
+ struct ubifs_ino_node *ino;
+ struct ubifs_cs_node *cs;
+ union ubifs_key key;
+ int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
+ int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
+ int min_leb_cnt = UBIFS_MIN_LEB_CNT;
+ uint64_t tmp64, main_bytes;
+
+ /* Some functions called from here depend on the @c->key_len filed */
+ c->key_len = UBIFS_SK_LEN;
+
+ /*
+ * First of all, we have to calculate default file-system geometry -
+ * log size, journal size, etc.
+ */
+ if (c->leb_cnt < 0x7FFFFFFF / DEFAULT_JNL_PERCENT)
+ /* We can first multiply then divide and have no overflow */
+ jnl_lebs = c->leb_cnt * DEFAULT_JNL_PERCENT / 100;
+ else
+ jnl_lebs = (c->leb_cnt / 100) * DEFAULT_JNL_PERCENT;
+
+ if (jnl_lebs < UBIFS_MIN_JNL_LEBS)
+ jnl_lebs = UBIFS_MIN_JNL_LEBS;
+ if (jnl_lebs * c->leb_size > DEFAULT_MAX_JNL)
+ jnl_lebs = DEFAULT_MAX_JNL / c->leb_size;
+
+ /*
+ * The log should be large enough to fit reference nodes for all bud
+ * LEBs. Because buds do not have to start from the beginning of LEBs
+ * (half of the LEB may contain committed data), the log should
+ * generally be larger, make it twice as large.
+ */
+ tmp = 2 * (c->ref_node_alsz * jnl_lebs) + c->leb_size - 1;
+ log_lebs = tmp / c->leb_size;
+ /* Plus one LEB reserved for commit */
+ log_lebs += 1;
+ if (c->leb_cnt - min_leb_cnt > 8) {
+ /* And some extra space to allow writes while committing */
+ log_lebs += 1;
+ min_leb_cnt += 1;
+ }
+
+ max_buds = jnl_lebs - log_lebs;
+ if (max_buds < UBIFS_MIN_BUD_LEBS)
+ max_buds = UBIFS_MIN_BUD_LEBS;
+
+ /*
+ * Orphan nodes are stored in a separate area. One node can store a lot
+ * of orphan inode numbers, but when new orphan comes we just add a new
+ * orphan node. At some point the nodes are consolidated into one
+ * orphan node.
+ */
+ orph_lebs = UBIFS_MIN_ORPH_LEBS;
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ if (c->leb_cnt - min_leb_cnt > 1)
+ /*
+ * For debugging purposes it is better to have at least 2
+ * orphan LEBs, because the orphan subsystem would need to do
+ * consolidations and would be stressed more.
+ */
+ orph_lebs += 1;
+#endif
+
+ main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs;
+ main_lebs -= orph_lebs;
+
+ lpt_first = UBIFS_LOG_LNUM + log_lebs;
+ c->lsave_cnt = DEFAULT_LSAVE_CNT;
+ c->max_leb_cnt = c->leb_cnt;
+ err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs,
+ &big_lpt);
+ if (err)
+ return err;
+
+ dbg_gen("LEB Properties Tree created (LEBs %d-%d)", lpt_first,
+ lpt_first + lpt_lebs - 1);
+
+ main_first = c->leb_cnt - main_lebs;
+
+ /* Create default superblock */
+ tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
+ sup = kzalloc(tmp, GFP_KERNEL);
+ if (!sup)
+ return -ENOMEM;
+
+ tmp64 = (uint64_t)max_buds * c->leb_size;
+ if (big_lpt)
+ sup_flags |= UBIFS_FLG_BIGLPT;
+
+ sup->ch.node_type = UBIFS_SB_NODE;
+ sup->key_hash = UBIFS_KEY_HASH_R5;
+ sup->flags = cpu_to_le32(sup_flags);
+ sup->min_io_size = cpu_to_le32(c->min_io_size);
+ sup->leb_size = cpu_to_le32(c->leb_size);
+ sup->leb_cnt = cpu_to_le32(c->leb_cnt);
+ sup->max_leb_cnt = cpu_to_le32(c->max_leb_cnt);
+ sup->max_bud_bytes = cpu_to_le64(tmp64);
+ sup->log_lebs = cpu_to_le32(log_lebs);
+ sup->lpt_lebs = cpu_to_le32(lpt_lebs);
+ sup->orph_lebs = cpu_to_le32(orph_lebs);
+ sup->jhead_cnt = cpu_to_le32(DEFAULT_JHEADS_CNT);
+ sup->fanout = cpu_to_le32(DEFAULT_FANOUT);
+ sup->lsave_cnt = cpu_to_le32(c->lsave_cnt);
+ sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION);
+ sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
+ sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN);
+
+ generate_random_uuid(sup->uuid);
+
+ main_bytes = (uint64_t)main_lebs * c->leb_size;
+ tmp64 = main_bytes * DEFAULT_RP_PERCENT;
+ do_div(tmp64, 100);
+ if (tmp64 > DEFAULT_MAX_RP_SIZE)
+ tmp64 = DEFAULT_MAX_RP_SIZE;
+ sup->rp_size = cpu_to_le64(tmp64);
+
+ err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
+ kfree(sup);
+ if (err)
+ return err;
+
+ dbg_gen("default superblock created at LEB 0:0");
+
+ /* Create default master node */
+ mst = kzalloc(c->mst_node_alsz, GFP_KERNEL);
+ if (!mst)
+ return -ENOMEM;
+
+ mst->ch.node_type = UBIFS_MST_NODE;
+ mst->log_lnum = cpu_to_le32(UBIFS_LOG_LNUM);
+ mst->highest_inum = cpu_to_le64(UBIFS_FIRST_INO);
+ mst->cmt_no = 0;
+ mst->root_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB);
+ mst->root_offs = 0;
+ tmp = ubifs_idx_node_sz(c, 1);
+ mst->root_len = cpu_to_le32(tmp);
+ mst->gc_lnum = cpu_to_le32(main_first + DEFAULT_GC_LEB);
+ mst->ihead_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB);
+ mst->ihead_offs = cpu_to_le32(ALIGN(tmp, c->min_io_size));
+ mst->index_size = cpu_to_le64(ALIGN(tmp, 8));
+ mst->lpt_lnum = cpu_to_le32(c->lpt_lnum);
+ mst->lpt_offs = cpu_to_le32(c->lpt_offs);
+ mst->nhead_lnum = cpu_to_le32(c->nhead_lnum);
+ mst->nhead_offs = cpu_to_le32(c->nhead_offs);
+ mst->ltab_lnum = cpu_to_le32(c->ltab_lnum);
+ mst->ltab_offs = cpu_to_le32(c->ltab_offs);
+ mst->lsave_lnum = cpu_to_le32(c->lsave_lnum);
+ mst->lsave_offs = cpu_to_le32(c->lsave_offs);
+ mst->lscan_lnum = cpu_to_le32(main_first);
+ mst->empty_lebs = cpu_to_le32(main_lebs - 2);
+ mst->idx_lebs = cpu_to_le32(1);
+ mst->leb_cnt = cpu_to_le32(c->leb_cnt);
+
+ /* Calculate lprops statistics */
+ tmp64 = main_bytes;
+ tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size);
+ tmp64 -= ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);
+ mst->total_free = cpu_to_le64(tmp64);
+
+ tmp64 = ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size);
+ ino_waste = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size) -
+ UBIFS_INO_NODE_SZ;
+ tmp64 += ino_waste;
+ tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), 8);
+ mst->total_dirty = cpu_to_le64(tmp64);
+
+ /* The indexing LEB does not contribute to dark space */
+ tmp64 = (c->main_lebs - 1) * c->dark_wm;
+ mst->total_dark = cpu_to_le64(tmp64);
+
+ mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
+
+ err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0,
+ UBI_UNKNOWN);
+ if (err) {
+ kfree(mst);
+ return err;
+ }
+ err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0,
+ UBI_UNKNOWN);
+ kfree(mst);
+ if (err)
+ return err;
+
+ dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM);
+
+ /* Create the root indexing node */
+ tmp = ubifs_idx_node_sz(c, 1);
+ idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL);
+ if (!idx)
+ return -ENOMEM;
+
+ c->key_fmt = UBIFS_SIMPLE_KEY_FMT;
+ c->key_hash = key_r5_hash;
+
+ idx->ch.node_type = UBIFS_IDX_NODE;
+ idx->child_cnt = cpu_to_le16(1);
+ ino_key_init(c, &key, UBIFS_ROOT_INO);
+ br = ubifs_idx_branch(c, idx, 0);
+ key_write_idx(c, &key, &br->key);
+ br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB);
+ br->len = cpu_to_le32(UBIFS_INO_NODE_SZ);
+ err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0,
+ UBI_UNKNOWN);
+ kfree(idx);
+ if (err)
+ return err;
+
+ dbg_gen("default root indexing node created LEB %d:0",
+ main_first + DEFAULT_IDX_LEB);
+
+ /* Create default root inode */
+ tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);
+ ino = kzalloc(tmp, GFP_KERNEL);
+ if (!ino)
+ return -ENOMEM;
+
+ ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO);
+ ino->ch.node_type = UBIFS_INO_NODE;
+ ino->creat_sqnum = cpu_to_le64(++c->max_sqnum);
+ ino->nlink = cpu_to_le32(2);
+ tmp = cpu_to_le64(CURRENT_TIME_SEC.tv_sec);
+ ino->atime_sec = tmp;
+ ino->ctime_sec = tmp;
+ ino->mtime_sec = tmp;
+ ino->atime_nsec = 0;
+ ino->ctime_nsec = 0;
+ ino->mtime_nsec = 0;
+ ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO);
+ ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ);
+
+ /* Set compression enabled by default */
+ ino->flags = cpu_to_le32(UBIFS_COMPR_FL);
+
+ err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ,
+ main_first + DEFAULT_DATA_LEB, 0,
+ UBI_UNKNOWN);
+ kfree(ino);
+ if (err)
+ return err;
+
+ dbg_gen("root inode created at LEB %d:0",
+ main_first + DEFAULT_DATA_LEB);
+
+ /*
+ * The first node in the log has to be the commit start node. This is
+ * always the case during normal file-system operation. Write a fake
+ * commit start node to the log.
+ */
+ tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size);
+ cs = kzalloc(tmp, GFP_KERNEL);
+ if (!cs)
+ return -ENOMEM;
+
+ cs->ch.node_type = UBIFS_CS_NODE;
+ err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM,
+ 0, UBI_UNKNOWN);
+ kfree(cs);
+
+ ubifs_msg("default file-system created");
+ return 0;
+}
+
+/**
+ * validate_sb - validate superblock node.
+ * @c: UBIFS file-system description object
+ * @sup: superblock node
+ *
+ * This function validates superblock node @sup. Since most of data was read
+ * from the superblock and stored in @c, the function validates fields in @c
+ * instead. Returns zero in case of success and %-EINVAL in case of validation
+ * failure.
+ */
+static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
+{
+ long long max_bytes;
+ int err = 1, min_leb_cnt;
+
+ if (!c->key_hash) {
+ err = 2;
+ goto failed;
+ }
+
+ if (sup->key_fmt != UBIFS_SIMPLE_KEY_FMT) {
+ err = 3;
+ goto failed;
+ }
+
+ if (le32_to_cpu(sup->min_io_size) != c->min_io_size) {
+ ubifs_err("min. I/O unit mismatch: %d in superblock, %d real",
+ le32_to_cpu(sup->min_io_size), c->min_io_size);
+ goto failed;
+ }
+
+ if (le32_to_cpu(sup->leb_size) != c->leb_size) {
+ ubifs_err("LEB size mismatch: %d in superblock, %d real",
+ le32_to_cpu(sup->leb_size), c->leb_size);
+ goto failed;
+ }
+
+ if (c->log_lebs < UBIFS_MIN_LOG_LEBS ||
+ c->lpt_lebs < UBIFS_MIN_LPT_LEBS ||
+ c->orph_lebs < UBIFS_MIN_ORPH_LEBS ||
+ c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
+ err = 4;
+ goto failed;
+ }
+
+ /*
+ * Calculate minimum allowed amount of main area LEBs. This is very
+ * similar to %UBIFS_MIN_LEB_CNT, but we take into account real what we
+ * have just read from the superblock.
+ */
+ min_leb_cnt = UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs;
+ min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6;
+
+ if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) {
+ ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, "
+ "%d minimum required", c->leb_cnt, c->vi.size,
+ min_leb_cnt);
+ goto failed;
+ }
+
+ if (c->max_leb_cnt < c->leb_cnt) {
+ ubifs_err("max. LEB count %d less than LEB count %d",
+ c->max_leb_cnt, c->leb_cnt);
+ goto failed;
+ }
+
+ if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
+ err = 7;
+ goto failed;
+ }
+
+ if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS ||
+ c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) {
+ err = 8;
+ goto failed;
+ }
+
+ if (c->jhead_cnt < NONDATA_JHEADS_CNT + 1 ||
+ c->jhead_cnt > NONDATA_JHEADS_CNT + UBIFS_MAX_JHEADS) {
+ err = 9;
+ goto failed;
+ }
+
+ if (c->fanout < UBIFS_MIN_FANOUT ||
+ ubifs_idx_node_sz(c, c->fanout) > c->leb_size) {
+ err = 10;
+ goto failed;
+ }
+
+ if (c->lsave_cnt < 0 || (c->lsave_cnt > DEFAULT_LSAVE_CNT &&
+ c->lsave_cnt > c->max_leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS -
+ c->log_lebs - c->lpt_lebs - c->orph_lebs)) {
+ err = 11;
+ goto failed;
+ }
+
+ if (UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs + c->lpt_lebs +
+ c->orph_lebs + c->main_lebs != c->leb_cnt) {
+ err = 12;
+ goto failed;
+ }
+
+ if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
+ err = 13;
+ goto failed;
+ }
+
+ max_bytes = c->main_lebs * (long long)c->leb_size;
+ if (c->rp_size < 0 || max_bytes < c->rp_size) {
+ err = 14;
+ goto failed;
+ }
+
+ if (le32_to_cpu(sup->time_gran) > 1000000000 ||
+ le32_to_cpu(sup->time_gran) < 1) {
+ err = 15;
+ goto failed;
+ }
+
+ return 0;
+
+failed:
+ ubifs_err("bad superblock, error %d", err);
+ dbg_dump_node(c, sup);
+ return -EINVAL;
+}
+
+/**
+ * ubifs_read_sb_node - read superblock node.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns a pointer to the superblock node or a negative error
+ * code.
+ */
+struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c)
+{
+ struct ubifs_sb_node *sup;
+ int err;
+
+ sup = kmalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_NOFS);
+ if (!sup)
+ return ERR_PTR(-ENOMEM);
+
+ err = ubifs_read_node(c, sup, UBIFS_SB_NODE, UBIFS_SB_NODE_SZ,
+ UBIFS_SB_LNUM, 0);
+ if (err) {
+ kfree(sup);
+ return ERR_PTR(err);
+ }
+
+ return sup;
+}
+
+/**
+ * ubifs_write_sb_node - write superblock node.
+ * @c: UBIFS file-system description object
+ * @sup: superblock node read with 'ubifs_read_sb_node()'
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup)
+{
+ int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
+
+ ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1);
+ return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM);
+}
+
+/**
+ * ubifs_read_superblock - read superblock.
+ * @c: UBIFS file-system description object
+ *
+ * This function finds, reads and checks the superblock. If an empty UBI volume
+ * is being mounted, this function creates default superblock. Returns zero in
+ * case of success, and a negative error code in case of failure.
+ */
+int ubifs_read_superblock(struct ubifs_info *c)
+{
+ int err, sup_flags;
+ struct ubifs_sb_node *sup;
+
+ if (c->empty) {
+ err = create_default_filesystem(c);
+ if (err)
+ return err;
+ }
+
+ sup = ubifs_read_sb_node(c);
+ if (IS_ERR(sup))
+ return PTR_ERR(sup);
+
+ /*
+ * The software supports all previous versions but not future versions,
+ * due to the unavailability of time-travelling equipment.
+ */
+ c->fmt_version = le32_to_cpu(sup->fmt_version);
+ if (c->fmt_version > UBIFS_FORMAT_VERSION) {
+ ubifs_err("on-flash format version is %d, but software only "
+ "supports up to version %d", c->fmt_version,
+ UBIFS_FORMAT_VERSION);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (c->fmt_version < 3) {
+ ubifs_err("on-flash format version %d is not supported",
+ c->fmt_version);
+ err = -EINVAL;
+ goto out;
+ }
+
+ switch (sup->key_hash) {
+ case UBIFS_KEY_HASH_R5:
+ c->key_hash = key_r5_hash;
+ c->key_hash_type = UBIFS_KEY_HASH_R5;
+ break;
+
+ case UBIFS_KEY_HASH_TEST:
+ c->key_hash = key_test_hash;
+ c->key_hash_type = UBIFS_KEY_HASH_TEST;
+ break;
+ };
+
+ c->key_fmt = sup->key_fmt;
+
+ switch (c->key_fmt) {
+ case UBIFS_SIMPLE_KEY_FMT:
+ c->key_len = UBIFS_SK_LEN;
+ break;
+ default:
+ ubifs_err("unsupported key format");
+ err = -EINVAL;
+ goto out;
+ }
+
+ c->leb_cnt = le32_to_cpu(sup->leb_cnt);
+ c->max_leb_cnt = le32_to_cpu(sup->max_leb_cnt);
+ c->max_bud_bytes = le64_to_cpu(sup->max_bud_bytes);
+ c->log_lebs = le32_to_cpu(sup->log_lebs);
+ c->lpt_lebs = le32_to_cpu(sup->lpt_lebs);
+ c->orph_lebs = le32_to_cpu(sup->orph_lebs);
+ c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT;
+ c->fanout = le32_to_cpu(sup->fanout);
+ c->lsave_cnt = le32_to_cpu(sup->lsave_cnt);
+ c->default_compr = le16_to_cpu(sup->default_compr);
+ c->rp_size = le64_to_cpu(sup->rp_size);
+ c->rp_uid = le32_to_cpu(sup->rp_uid);
+ c->rp_gid = le32_to_cpu(sup->rp_gid);
+ sup_flags = le32_to_cpu(sup->flags);
+
+ c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
+
+ memcpy(&c->uuid, &sup->uuid, 16);
+
+ c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
+
+ /* Automatically increase file system size to the maximum size */
+ c->old_leb_cnt = c->leb_cnt;
+ if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {
+ c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);
+ if (c->vfs_sb->s_flags & MS_RDONLY)
+ dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",
+ c->old_leb_cnt, c->leb_cnt);
+ else {
+ dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs",
+ c->old_leb_cnt, c->leb_cnt);
+ sup->leb_cnt = cpu_to_le32(c->leb_cnt);
+ err = ubifs_write_sb_node(c, sup);
+ if (err)
+ goto out;
+ c->old_leb_cnt = c->leb_cnt;
+ }
+ }
+
+ c->log_bytes = (long long)c->log_lebs * c->leb_size;
+ c->log_last = UBIFS_LOG_LNUM + c->log_lebs - 1;
+ c->lpt_first = UBIFS_LOG_LNUM + c->log_lebs;
+ c->lpt_last = c->lpt_first + c->lpt_lebs - 1;
+ c->orph_first = c->lpt_last + 1;
+ c->orph_last = c->orph_first + c->orph_lebs - 1;
+ c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
+ c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
+ c->main_first = c->leb_cnt - c->main_lebs;
+ c->report_rp_size = ubifs_reported_space(c, c->rp_size);
+
+ err = validate_sb(c, sup);
+out:
+ kfree(sup);
+ return err;
+}
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
new file mode 100644
index 00000000000..acf5c5fffc6
--- /dev/null
+++ b/fs/ubifs/scan.c
@@ -0,0 +1,362 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the scan which is a general-purpose function for
+ * determining what nodes are in an eraseblock. The scan is used to replay the
+ * journal, to do garbage collection. for the TNC in-the-gaps method, and by
+ * debugging functions.
+ */
+
+#include "ubifs.h"
+
+/**
+ * scan_padding_bytes - scan for padding bytes.
+ * @buf: buffer to scan
+ * @len: length of buffer
+ *
+ * This function returns the number of padding bytes on success and
+ * %SCANNED_GARBAGE on failure.
+ */
+static int scan_padding_bytes(void *buf, int len)
+{
+ int pad_len = 0, max_pad_len = min_t(int, UBIFS_PAD_NODE_SZ, len);
+ uint8_t *p = buf;
+
+ dbg_scan("not a node");
+
+ while (pad_len < max_pad_len && *p++ == UBIFS_PADDING_BYTE)
+ pad_len += 1;
+
+ if (!pad_len || (pad_len & 7))
+ return SCANNED_GARBAGE;
+
+ dbg_scan("%d padding bytes", pad_len);
+
+ return pad_len;
+}
+
+/**
+ * ubifs_scan_a_node - scan for a node or padding.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to scan
+ * @len: length of buffer
+ * @lnum: logical eraseblock number
+ * @offs: offset within the logical eraseblock
+ * @quiet: print no messages
+ *
+ * This function returns a scanning code to indicate what was scanned.
+ */
+int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
+ int offs, int quiet)
+{
+ struct ubifs_ch *ch = buf;
+ uint32_t magic;
+
+ magic = le32_to_cpu(ch->magic);
+
+ if (magic == 0xFFFFFFFF) {
+ dbg_scan("hit empty space");
+ return SCANNED_EMPTY_SPACE;
+ }
+
+ if (magic != UBIFS_NODE_MAGIC)
+ return scan_padding_bytes(buf, len);
+
+ if (len < UBIFS_CH_SZ)
+ return SCANNED_GARBAGE;
+
+ dbg_scan("scanning %s", dbg_ntype(ch->node_type));
+
+ if (ubifs_check_node(c, buf, lnum, offs, quiet))
+ return SCANNED_A_CORRUPT_NODE;
+
+ if (ch->node_type == UBIFS_PAD_NODE) {
+ struct ubifs_pad_node *pad = buf;
+ int pad_len = le32_to_cpu(pad->pad_len);
+ int node_len = le32_to_cpu(ch->len);
+
+ /* Validate the padding node */
+ if (pad_len < 0 ||
+ offs + node_len + pad_len > c->leb_size) {
+ if (!quiet) {
+ ubifs_err("bad pad node at LEB %d:%d",
+ lnum, offs);
+ dbg_dump_node(c, pad);
+ }
+ return SCANNED_A_BAD_PAD_NODE;
+ }
+
+ /* Make the node pads to 8-byte boundary */
+ if ((node_len + pad_len) & 7) {
+ if (!quiet) {
+ dbg_err("bad padding length %d - %d",
+ offs, offs + node_len + pad_len);
+ }
+ return SCANNED_A_BAD_PAD_NODE;
+ }
+
+ dbg_scan("%d bytes padded, offset now %d",
+ pad_len, ALIGN(offs + node_len + pad_len, 8));
+
+ return node_len + pad_len;
+ }
+
+ return SCANNED_A_NODE;
+}
+
+/**
+ * ubifs_start_scan - create LEB scanning information at start of scan.
+ * @c: UBIFS file-system description object
+ * @lnum: logical eraseblock number
+ * @offs: offset to start at (usually zero)
+ * @sbuf: scan buffer (must be c->leb_size)
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
+ int offs, void *sbuf)
+{
+ struct ubifs_scan_leb *sleb;
+ int err;
+
+ dbg_scan("scan LEB %d:%d", lnum, offs);
+
+ sleb = kzalloc(sizeof(struct ubifs_scan_leb), GFP_NOFS);
+ if (!sleb)
+ return ERR_PTR(-ENOMEM);
+
+ sleb->lnum = lnum;
+ INIT_LIST_HEAD(&sleb->nodes);
+ sleb->buf = sbuf;
+
+ err = ubi_read(c->ubi, lnum, sbuf + offs, offs, c->leb_size - offs);
+ if (err && err != -EBADMSG) {
+ ubifs_err("cannot read %d bytes from LEB %d:%d,"
+ " error %d", c->leb_size - offs, lnum, offs, err);
+ kfree(sleb);
+ return ERR_PTR(err);
+ }
+
+ if (err == -EBADMSG)
+ sleb->ecc = 1;
+
+ return sleb;
+}
+
+/**
+ * ubifs_end_scan - update LEB scanning information at end of scan.
+ * @c: UBIFS file-system description object
+ * @sleb: scanning information
+ * @lnum: logical eraseblock number
+ * @offs: offset to start at (usually zero)
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+ int lnum, int offs)
+{
+ lnum = lnum;
+ dbg_scan("stop scanning LEB %d at offset %d", lnum, offs);
+ ubifs_assert(offs % c->min_io_size == 0);
+
+ sleb->endpt = ALIGN(offs, c->min_io_size);
+}
+
+/**
+ * ubifs_add_snod - add a scanned node to LEB scanning information.
+ * @c: UBIFS file-system description object
+ * @sleb: scanning information
+ * @buf: buffer containing node
+ * @offs: offset of node on flash
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+ void *buf, int offs)
+{
+ struct ubifs_ch *ch = buf;
+ struct ubifs_ino_node *ino = buf;
+ struct ubifs_scan_node *snod;
+
+ snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS);
+ if (!snod)
+ return -ENOMEM;
+
+ snod->sqnum = le64_to_cpu(ch->sqnum);
+ snod->type = ch->node_type;
+ snod->offs = offs;
+ snod->len = le32_to_cpu(ch->len);
+ snod->node = buf;
+
+ switch (ch->node_type) {
+ case UBIFS_INO_NODE:
+ case UBIFS_DENT_NODE:
+ case UBIFS_XENT_NODE:
+ case UBIFS_DATA_NODE:
+ case UBIFS_TRUN_NODE:
+ /*
+ * The key is in the same place in all keyed
+ * nodes.
+ */
+ key_read(c, &ino->key, &snod->key);
+ break;
+ }
+ list_add_tail(&snod->list, &sleb->nodes);
+ sleb->nodes_cnt += 1;
+ return 0;
+}
+
+/**
+ * ubifs_scanned_corruption - print information after UBIFS scanned corruption.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of corruption
+ * @offs: offset of corruption
+ * @buf: buffer containing corruption
+ */
+void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
+ void *buf)
+{
+ int len;
+
+ ubifs_err("corrupted data at LEB %d:%d", lnum, offs);
+ if (dbg_failure_mode)
+ return;
+ len = c->leb_size - offs;
+ if (len > 4096)
+ len = 4096;
+ dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs);
+ print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1);
+}
+
+/**
+ * ubifs_scan - scan a logical eraseblock.
+ * @c: UBIFS file-system description object
+ * @lnum: logical eraseblock number
+ * @offs: offset to start at (usually zero)
+ * @sbuf: scan buffer (must be c->leb_size)
+ *
+ * This function scans LEB number @lnum and returns complete information about
+ * its contents. Returns an error code in case of failure.
+ */
+struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
+ int offs, void *sbuf)
+{
+ void *buf = sbuf + offs;
+ int err, len = c->leb_size - offs;
+ struct ubifs_scan_leb *sleb;
+
+ sleb = ubifs_start_scan(c, lnum, offs, sbuf);
+ if (IS_ERR(sleb))
+ return sleb;
+
+ while (len >= 8) {
+ struct ubifs_ch *ch = buf;
+ int node_len, ret;
+
+ dbg_scan("look at LEB %d:%d (%d bytes left)",
+ lnum, offs, len);
+
+ cond_resched();
+
+ ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
+
+ if (ret > 0) {
+ /* Padding bytes or a valid padding node */
+ offs += ret;
+ buf += ret;
+ len -= ret;
+ continue;
+ }
+
+ if (ret == SCANNED_EMPTY_SPACE)
+ /* Empty space is checked later */
+ break;
+
+ switch (ret) {
+ case SCANNED_GARBAGE:
+ dbg_err("garbage");
+ goto corrupted;
+ case SCANNED_A_NODE:
+ break;
+ case SCANNED_A_CORRUPT_NODE:
+ case SCANNED_A_BAD_PAD_NODE:
+ dbg_err("bad node");
+ goto corrupted;
+ default:
+ dbg_err("unknown");
+ goto corrupted;
+ }
+
+ err = ubifs_add_snod(c, sleb, buf, offs);
+ if (err)
+ goto error;
+
+ node_len = ALIGN(le32_to_cpu(ch->len), 8);
+ offs += node_len;
+ buf += node_len;
+ len -= node_len;
+ }
+
+ if (offs % c->min_io_size)
+ goto corrupted;
+
+ ubifs_end_scan(c, sleb, lnum, offs);
+
+ for (; len > 4; offs += 4, buf = buf + 4, len -= 4)
+ if (*(uint32_t *)buf != 0xffffffff)
+ break;
+ for (; len; offs++, buf++, len--)
+ if (*(uint8_t *)buf != 0xff) {
+ ubifs_err("corrupt empty space at LEB %d:%d",
+ lnum, offs);
+ goto corrupted;
+ }
+
+ return sleb;
+
+corrupted:
+ ubifs_scanned_corruption(c, lnum, offs, buf);
+ err = -EUCLEAN;
+error:
+ ubifs_err("LEB %d scanning failed", lnum);
+ ubifs_scan_destroy(sleb);
+ return ERR_PTR(err);
+}
+
+/**
+ * ubifs_scan_destroy - destroy LEB scanning information.
+ * @sleb: scanning information to free
+ */
+void ubifs_scan_destroy(struct ubifs_scan_leb *sleb)
+{
+ struct ubifs_scan_node *node;
+ struct list_head *head;
+
+ head = &sleb->nodes;
+ while (!list_empty(head)) {
+ node = list_entry(head->next, struct ubifs_scan_node, list);
+ list_del(&node->list);
+ kfree(node);
+ }
+ kfree(sleb);
+}
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
new file mode 100644
index 00000000000..f248533841a
--- /dev/null
+++ b/fs/ubifs/shrinker.c
@@ -0,0 +1,322 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS shrinker which evicts clean znodes from the TNC
+ * tree when Linux VM needs more RAM.
+ *
+ * We do not implement any LRU lists to find oldest znodes to free because it
+ * would add additional overhead to the file system fast paths. So the shrinker
+ * just walks the TNC tree when searching for znodes to free.
+ *
+ * If the root of a TNC sub-tree is clean and old enough, then the children are
+ * also clean and old enough. So the shrinker walks the TNC in level order and
+ * dumps entire sub-trees.
+ *
+ * The age of znodes is just the time-stamp when they were last looked at.
+ * The current shrinker first tries to evict old znodes, then young ones.
+ *
+ * Since the shrinker is global, it has to protect against races with FS
+ * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'.
+ */
+
+#include "ubifs.h"
+
+/* List of all UBIFS file-system instances */
+LIST_HEAD(ubifs_infos);
+
+/*
+ * We number each shrinker run and record the number on the ubifs_info structure
+ * so that we can easily work out which ubifs_info structures have already been
+ * done by the current run.
+ */
+static unsigned int shrinker_run_no;
+
+/* Protects 'ubifs_infos' list */
+DEFINE_SPINLOCK(ubifs_infos_lock);
+
+/* Global clean znode counter (for all mounted UBIFS instances) */
+atomic_long_t ubifs_clean_zn_cnt;
+
+/**
+ * shrink_tnc - shrink TNC tree.
+ * @c: UBIFS file-system description object
+ * @nr: number of znodes to free
+ * @age: the age of znodes to free
+ * @contention: if any contention, this is set to %1
+ *
+ * This function traverses TNC tree and frees clean znodes. It does not free
+ * clean znodes which younger then @age. Returns number of freed znodes.
+ */
+static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
+{
+ int total_freed = 0;
+ struct ubifs_znode *znode, *zprev;
+ int time = get_seconds();
+
+ ubifs_assert(mutex_is_locked(&c->umount_mutex));
+ ubifs_assert(mutex_is_locked(&c->tnc_mutex));
+
+ if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0)
+ return 0;
+
+ /*
+ * Traverse the TNC tree in levelorder manner, so that it is possible
+ * to destroy large sub-trees. Indeed, if a znode is old, then all its
+ * children are older or of the same age.
+ *
+ * Note, we are holding 'c->tnc_mutex', so we do not have to lock the
+ * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is
+ * changed only when the 'c->tnc_mutex' is held.
+ */
+ zprev = NULL;
+ znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
+ while (znode && total_freed < nr &&
+ atomic_long_read(&c->clean_zn_cnt) > 0) {
+ int freed;
+
+ /*
+ * If the znode is clean, but it is in the 'c->cnext' list, this
+ * means that this znode has just been written to flash as a
+ * part of commit and was marked clean. They will be removed
+ * from the list at end commit. We cannot change the list,
+ * because it is not protected by any mutex (design decision to
+ * make commit really independent and parallel to main I/O). So
+ * we just skip these znodes.
+ *
+ * Note, the 'clean_zn_cnt' counters are not updated until
+ * after the commit, so the UBIFS shrinker does not report
+ * the znodes which are in the 'c->cnext' list as freeable.
+ *
+ * Also note, if the root of a sub-tree is not in 'c->cnext',
+ * then the whole sub-tree is not in 'c->cnext' as well, so it
+ * is safe to dump whole sub-tree.
+ */
+
+ if (znode->cnext) {
+ /*
+ * Very soon these znodes will be removed from the list
+ * and become freeable.
+ */
+ *contention = 1;
+ } else if (!ubifs_zn_dirty(znode) &&
+ abs(time - znode->time) >= age) {
+ if (znode->parent)
+ znode->parent->zbranch[znode->iip].znode = NULL;
+ else
+ c->zroot.znode = NULL;
+
+ freed = ubifs_destroy_tnc_subtree(znode);
+ atomic_long_sub(freed, &ubifs_clean_zn_cnt);
+ atomic_long_sub(freed, &c->clean_zn_cnt);
+ ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0);
+ total_freed += freed;
+ znode = zprev;
+ }
+
+ if (unlikely(!c->zroot.znode))
+ break;
+
+ zprev = znode;
+ znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
+ cond_resched();
+ }
+
+ return total_freed;
+}
+
+/**
+ * shrink_tnc_trees - shrink UBIFS TNC trees.
+ * @nr: number of znodes to free
+ * @age: the age of znodes to free
+ * @contention: if any contention, this is set to %1
+ *
+ * This function walks the list of mounted UBIFS file-systems and frees clean
+ * znodes which are older then @age, until at least @nr znodes are freed.
+ * Returns the number of freed znodes.
+ */
+static int shrink_tnc_trees(int nr, int age, int *contention)
+{
+ struct ubifs_info *c;
+ struct list_head *p;
+ unsigned int run_no;
+ int freed = 0;
+
+ spin_lock(&ubifs_infos_lock);
+ do {
+ run_no = ++shrinker_run_no;
+ } while (run_no == 0);
+ /* Iterate over all mounted UBIFS file-systems and try to shrink them */
+ p = ubifs_infos.next;
+ while (p != &ubifs_infos) {
+ c = list_entry(p, struct ubifs_info, infos_list);
+ /*
+ * We move the ones we do to the end of the list, so we stop
+ * when we see one we have already done.
+ */
+ if (c->shrinker_run_no == run_no)
+ break;
+ if (!mutex_trylock(&c->umount_mutex)) {
+ /* Some un-mount is in progress, try next FS */
+ *contention = 1;
+ p = p->next;
+ continue;
+ }
+ /*
+ * We're holding 'c->umount_mutex', so the file-system won't go
+ * away.
+ */
+ if (!mutex_trylock(&c->tnc_mutex)) {
+ mutex_unlock(&c->umount_mutex);
+ *contention = 1;
+ p = p->next;
+ continue;
+ }
+ spin_unlock(&ubifs_infos_lock);
+ /*
+ * OK, now we have TNC locked, the file-system cannot go away -
+ * it is safe to reap the cache.
+ */
+ c->shrinker_run_no = run_no;
+ freed += shrink_tnc(c, nr, age, contention);
+ mutex_unlock(&c->tnc_mutex);
+ spin_lock(&ubifs_infos_lock);
+ /* Get the next list element before we move this one */
+ p = p->next;
+ /*
+ * Move this one to the end of the list to provide some
+ * fairness.
+ */
+ list_del(&c->infos_list);
+ list_add_tail(&c->infos_list, &ubifs_infos);
+ mutex_unlock(&c->umount_mutex);
+ if (freed >= nr)
+ break;
+ }
+ spin_unlock(&ubifs_infos_lock);
+ return freed;
+}
+
+/**
+ * kick_a_thread - kick a background thread to start commit.
+ *
+ * This function kicks a background thread to start background commit. Returns
+ * %-1 if a thread was kicked or there is another reason to assume the memory
+ * will soon be freed or become freeable. If there are no dirty znodes, returns
+ * %0.
+ */
+static int kick_a_thread(void)
+{
+ int i;
+ struct ubifs_info *c;
+
+ /*
+ * Iterate over all mounted UBIFS file-systems and find out if there is
+ * already an ongoing commit operation there. If no, then iterate for
+ * the second time and initiate background commit.
+ */
+ spin_lock(&ubifs_infos_lock);
+ for (i = 0; i < 2; i++) {
+ list_for_each_entry(c, &ubifs_infos, infos_list) {
+ long dirty_zn_cnt;
+
+ if (!mutex_trylock(&c->umount_mutex)) {
+ /*
+ * Some un-mount is in progress, it will
+ * certainly free memory, so just return.
+ */
+ spin_unlock(&ubifs_infos_lock);
+ return -1;
+ }
+
+ dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt);
+
+ if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN ||
+ c->ro_media) {
+ mutex_unlock(&c->umount_mutex);
+ continue;
+ }
+
+ if (c->cmt_state != COMMIT_RESTING) {
+ spin_unlock(&ubifs_infos_lock);
+ mutex_unlock(&c->umount_mutex);
+ return -1;
+ }
+
+ if (i == 1) {
+ list_del(&c->infos_list);
+ list_add_tail(&c->infos_list, &ubifs_infos);
+ spin_unlock(&ubifs_infos_lock);
+
+ ubifs_request_bg_commit(c);
+ mutex_unlock(&c->umount_mutex);
+ return -1;
+ }
+ mutex_unlock(&c->umount_mutex);
+ }
+ }
+ spin_unlock(&ubifs_infos_lock);
+
+ return 0;
+}
+
+int ubifs_shrinker(int nr, gfp_t gfp_mask)
+{
+ int freed, contention = 0;
+ long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
+
+ if (nr == 0)
+ return clean_zn_cnt;
+
+ if (!clean_zn_cnt) {
+ /*
+ * No clean znodes, nothing to reap. All we can do in this case
+ * is to kick background threads to start commit, which will
+ * probably make clean znodes which, in turn, will be freeable.
+ * And we return -1 which means will make VM call us again
+ * later.
+ */
+ dbg_tnc("no clean znodes, kick a thread");
+ return kick_a_thread();
+ }
+
+ freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention);
+ if (freed >= nr)
+ goto out;
+
+ dbg_tnc("not enough old znodes, try to free young ones");
+ freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention);
+ if (freed >= nr)
+ goto out;
+
+ dbg_tnc("not enough young znodes, free all");
+ freed += shrink_tnc_trees(nr - freed, 0, &contention);
+
+ if (!freed && contention) {
+ dbg_tnc("freed nothing, but contention");
+ return -1;
+ }
+
+out:
+ dbg_tnc("%d znodes were freed, requested %d", freed, nr);
+ return freed;
+}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
new file mode 100644
index 00000000000..9a9220333b3
--- /dev/null
+++ b/fs/ubifs/super.c
@@ -0,0 +1,1968 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS initialization and VFS superblock operations. Some
+ * initialization stuff which is rather large and complex is placed at
+ * corresponding subsystems, but most of it is here.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/kthread.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include "ubifs.h"
+
+/* Slab cache for UBIFS inodes */
+struct kmem_cache *ubifs_inode_slab;
+
+/* UBIFS TNC shrinker description */
+static struct shrinker ubifs_shrinker_info = {
+ .shrink = ubifs_shrinker,
+ .seeks = DEFAULT_SEEKS,
+};
+
+/**
+ * validate_inode - validate inode.
+ * @c: UBIFS file-system description object
+ * @inode: the inode to validate
+ *
+ * This is a helper function for 'ubifs_iget()' which validates various fields
+ * of a newly built inode to make sure they contain sane values and prevent
+ * possible vulnerabilities. Returns zero if the inode is all right and
+ * a non-zero error code if not.
+ */
+static int validate_inode(struct ubifs_info *c, const struct inode *inode)
+{
+ int err;
+ const struct ubifs_inode *ui = ubifs_inode(inode);
+
+ if (inode->i_size > c->max_inode_sz) {
+ ubifs_err("inode is too large (%lld)",
+ (long long)inode->i_size);
+ return 1;
+ }
+
+ if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
+ ubifs_err("unknown compression type %d", ui->compr_type);
+ return 2;
+ }
+
+ if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX)
+ return 3;
+
+ if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA)
+ return 4;
+
+ if (ui->xattr && (inode->i_mode & S_IFMT) != S_IFREG)
+ return 5;
+
+ if (!ubifs_compr_present(ui->compr_type)) {
+ ubifs_warn("inode %lu uses '%s' compression, but it was not "
+ "compiled in", inode->i_ino,
+ ubifs_compr_name(ui->compr_type));
+ }
+
+ err = dbg_check_dir_size(c, inode);
+ return err;
+}
+
+struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
+{
+ int err;
+ union ubifs_key key;
+ struct ubifs_ino_node *ino;
+ struct ubifs_info *c = sb->s_fs_info;
+ struct inode *inode;
+ struct ubifs_inode *ui;
+
+ dbg_gen("inode %lu", inum);
+
+ inode = iget_locked(sb, inum);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
+ ui = ubifs_inode(inode);
+
+ ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS);
+ if (!ino) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ ino_key_init(c, &key, inode->i_ino);
+
+ err = ubifs_tnc_lookup(c, &key, ino);
+ if (err)
+ goto out_ino;
+
+ inode->i_flags |= (S_NOCMTIME | S_NOATIME);
+ inode->i_nlink = le32_to_cpu(ino->nlink);
+ inode->i_uid = le32_to_cpu(ino->uid);
+ inode->i_gid = le32_to_cpu(ino->gid);
+ inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec);
+ inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec);
+ inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec);
+ inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec);
+ inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec);
+ inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec);
+ inode->i_mode = le32_to_cpu(ino->mode);
+ inode->i_size = le64_to_cpu(ino->size);
+
+ ui->data_len = le32_to_cpu(ino->data_len);
+ ui->flags = le32_to_cpu(ino->flags);
+ ui->compr_type = le16_to_cpu(ino->compr_type);
+ ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum);
+ ui->xattr_cnt = le32_to_cpu(ino->xattr_cnt);
+ ui->xattr_size = le32_to_cpu(ino->xattr_size);
+ ui->xattr_names = le32_to_cpu(ino->xattr_names);
+ ui->synced_i_size = ui->ui_size = inode->i_size;
+
+ ui->xattr = (ui->flags & UBIFS_XATTR_FL) ? 1 : 0;
+
+ err = validate_inode(c, inode);
+ if (err)
+ goto out_invalid;
+
+ /* Disable read-ahead */
+ inode->i_mapping->backing_dev_info = &c->bdi;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_mapping->a_ops = &ubifs_file_address_operations;
+ inode->i_op = &ubifs_file_inode_operations;
+ inode->i_fop = &ubifs_file_operations;
+ if (ui->xattr) {
+ ui->data = kmalloc(ui->data_len + 1, GFP_NOFS);
+ if (!ui->data) {
+ err = -ENOMEM;
+ goto out_ino;
+ }
+ memcpy(ui->data, ino->data, ui->data_len);
+ ((char *)ui->data)[ui->data_len] = '\0';
+ } else if (ui->data_len != 0) {
+ err = 10;
+ goto out_invalid;
+ }
+ break;
+ case S_IFDIR:
+ inode->i_op = &ubifs_dir_inode_operations;
+ inode->i_fop = &ubifs_dir_operations;
+ if (ui->data_len != 0) {
+ err = 11;
+ goto out_invalid;
+ }
+ break;
+ case S_IFLNK:
+ inode->i_op = &ubifs_symlink_inode_operations;
+ if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) {
+ err = 12;
+ goto out_invalid;
+ }
+ ui->data = kmalloc(ui->data_len + 1, GFP_NOFS);
+ if (!ui->data) {
+ err = -ENOMEM;
+ goto out_ino;
+ }
+ memcpy(ui->data, ino->data, ui->data_len);
+ ((char *)ui->data)[ui->data_len] = '\0';
+ break;
+ case S_IFBLK:
+ case S_IFCHR:
+ {
+ dev_t rdev;
+ union ubifs_dev_desc *dev;
+
+ ui->data = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
+ if (!ui->data) {
+ err = -ENOMEM;
+ goto out_ino;
+ }
+
+ dev = (union ubifs_dev_desc *)ino->data;
+ if (ui->data_len == sizeof(dev->new))
+ rdev = new_decode_dev(le32_to_cpu(dev->new));
+ else if (ui->data_len == sizeof(dev->huge))
+ rdev = huge_decode_dev(le64_to_cpu(dev->huge));
+ else {
+ err = 13;
+ goto out_invalid;
+ }
+ memcpy(ui->data, ino->data, ui->data_len);
+ inode->i_op = &ubifs_file_inode_operations;
+ init_special_inode(inode, inode->i_mode, rdev);
+ break;
+ }
+ case S_IFSOCK:
+ case S_IFIFO:
+ inode->i_op = &ubifs_file_inode_operations;
+ init_special_inode(inode, inode->i_mode, 0);
+ if (ui->data_len != 0) {
+ err = 14;
+ goto out_invalid;
+ }
+ break;
+ default:
+ err = 15;
+ goto out_invalid;
+ }
+
+ kfree(ino);
+ ubifs_set_inode_flags(inode);
+ unlock_new_inode(inode);
+ return inode;
+
+out_invalid:
+ ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err);
+ dbg_dump_node(c, ino);
+ dbg_dump_inode(c, inode);
+ err = -EINVAL;
+out_ino:
+ kfree(ino);
+out:
+ ubifs_err("failed to read inode %lu, error %d", inode->i_ino, err);
+ iget_failed(inode);
+ return ERR_PTR(err);
+}
+
+static struct inode *ubifs_alloc_inode(struct super_block *sb)
+{
+ struct ubifs_inode *ui;
+
+ ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS);
+ if (!ui)
+ return NULL;
+
+ memset((void *)ui + sizeof(struct inode), 0,
+ sizeof(struct ubifs_inode) - sizeof(struct inode));
+ mutex_init(&ui->ui_mutex);
+ spin_lock_init(&ui->ui_lock);
+ return &ui->vfs_inode;
+};
+
+static void ubifs_destroy_inode(struct inode *inode)
+{
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ kfree(ui->data);
+ kmem_cache_free(ubifs_inode_slab, inode);
+}
+
+/*
+ * Note, Linux write-back code calls this without 'i_mutex'.
+ */
+static int ubifs_write_inode(struct inode *inode, int wait)
+{
+ int err = 0;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ ubifs_assert(!ui->xattr);
+ if (is_bad_inode(inode))
+ return 0;
+
+ mutex_lock(&ui->ui_mutex);
+ /*
+ * Due to races between write-back forced by budgeting
+ * (see 'sync_some_inodes()') and pdflush write-back, the inode may
+ * have already been synchronized, do not do this again. This might
+ * also happen if it was synchronized in an VFS operation, e.g.
+ * 'ubifs_link()'.
+ */
+ if (!ui->dirty) {
+ mutex_unlock(&ui->ui_mutex);
+ return 0;
+ }
+
+ /*
+ * As an optimization, do not write orphan inodes to the media just
+ * because this is not needed.
+ */
+ dbg_gen("inode %lu, mode %#x, nlink %u",
+ inode->i_ino, (int)inode->i_mode, inode->i_nlink);
+ if (inode->i_nlink) {
+ err = ubifs_jnl_write_inode(c, inode);
+ if (err)
+ ubifs_err("can't write inode %lu, error %d",
+ inode->i_ino, err);
+ }
+
+ ui->dirty = 0;
+ mutex_unlock(&ui->ui_mutex);
+ ubifs_release_dirty_inode_budget(c, ui);
+ return err;
+}
+
+static void ubifs_delete_inode(struct inode *inode)
+{
+ int err;
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ if (ui->xattr)
+ /*
+ * Extended attribute inode deletions are fully handled in
+ * 'ubifs_removexattr()'. These inodes are special and have
+ * limited usage, so there is nothing to do here.
+ */
+ goto out;
+
+ dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
+ ubifs_assert(!atomic_read(&inode->i_count));
+ ubifs_assert(inode->i_nlink == 0);
+
+ truncate_inode_pages(&inode->i_data, 0);
+ if (is_bad_inode(inode))
+ goto out;
+
+ ui->ui_size = inode->i_size = 0;
+ err = ubifs_jnl_delete_inode(c, inode);
+ if (err)
+ /*
+ * Worst case we have a lost orphan inode wasting space, so a
+ * simple error message is OK here.
+ */
+ ubifs_err("can't delete inode %lu, error %d",
+ inode->i_ino, err);
+
+out:
+ if (ui->dirty)
+ ubifs_release_dirty_inode_budget(c, ui);
+ clear_inode(inode);
+}
+
+static void ubifs_dirty_inode(struct inode *inode)
+{
+ struct ubifs_inode *ui = ubifs_inode(inode);
+
+ ubifs_assert(mutex_is_locked(&ui->ui_mutex));
+ if (!ui->dirty) {
+ ui->dirty = 1;
+ dbg_gen("inode %lu", inode->i_ino);
+ }
+}
+
+static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct ubifs_info *c = dentry->d_sb->s_fs_info;
+ unsigned long long free;
+ __le32 *uuid = (__le32 *)c->uuid;
+
+ free = ubifs_get_free_space(c);
+ dbg_gen("free space %lld bytes (%lld blocks)",
+ free, free >> UBIFS_BLOCK_SHIFT);
+
+ buf->f_type = UBIFS_SUPER_MAGIC;
+ buf->f_bsize = UBIFS_BLOCK_SIZE;
+ buf->f_blocks = c->block_cnt;
+ buf->f_bfree = free >> UBIFS_BLOCK_SHIFT;
+ if (free > c->report_rp_size)
+ buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT;
+ else
+ buf->f_bavail = 0;
+ buf->f_files = 0;
+ buf->f_ffree = 0;
+ buf->f_namelen = UBIFS_MAX_NLEN;
+ buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
+ buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
+ return 0;
+}
+
+static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
+{
+ struct ubifs_info *c = mnt->mnt_sb->s_fs_info;
+
+ if (c->mount_opts.unmount_mode == 2)
+ seq_printf(s, ",fast_unmount");
+ else if (c->mount_opts.unmount_mode == 1)
+ seq_printf(s, ",norm_unmount");
+
+ return 0;
+}
+
+static int ubifs_sync_fs(struct super_block *sb, int wait)
+{
+ struct ubifs_info *c = sb->s_fs_info;
+ int i, ret = 0, err;
+
+ if (c->jheads)
+ for (i = 0; i < c->jhead_cnt; i++) {
+ err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+ if (err && !ret)
+ ret = err;
+ }
+ /*
+ * We ought to call sync for c->ubi but it does not have one. If it had
+ * it would in turn call mtd->sync, however mtd operations are
+ * synchronous anyway, so we don't lose any sleep here.
+ */
+ return ret;
+}
+
+/**
+ * init_constants_early - initialize UBIFS constants.
+ * @c: UBIFS file-system description object
+ *
+ * This function initialize UBIFS constants which do not need the superblock to
+ * be read. It also checks that the UBI volume satisfies basic UBIFS
+ * requirements. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int init_constants_early(struct ubifs_info *c)
+{
+ if (c->vi.corrupted) {
+ ubifs_warn("UBI volume is corrupted - read-only mode");
+ c->ro_media = 1;
+ }
+
+ if (c->di.ro_mode) {
+ ubifs_msg("read-only UBI device");
+ c->ro_media = 1;
+ }
+
+ if (c->vi.vol_type == UBI_STATIC_VOLUME) {
+ ubifs_msg("static UBI volume - read-only mode");
+ c->ro_media = 1;
+ }
+
+ c->leb_cnt = c->vi.size;
+ c->leb_size = c->vi.usable_leb_size;
+ c->half_leb_size = c->leb_size / 2;
+ c->min_io_size = c->di.min_io_size;
+ c->min_io_shift = fls(c->min_io_size) - 1;
+
+ if (c->leb_size < UBIFS_MIN_LEB_SZ) {
+ ubifs_err("too small LEBs (%d bytes), min. is %d bytes",
+ c->leb_size, UBIFS_MIN_LEB_SZ);
+ return -EINVAL;
+ }
+
+ if (c->leb_cnt < UBIFS_MIN_LEB_CNT) {
+ ubifs_err("too few LEBs (%d), min. is %d",
+ c->leb_cnt, UBIFS_MIN_LEB_CNT);
+ return -EINVAL;
+ }
+
+ if (!is_power_of_2(c->min_io_size)) {
+ ubifs_err("bad min. I/O size %d", c->min_io_size);
+ return -EINVAL;
+ }
+
+ /*
+ * UBIFS aligns all node to 8-byte boundary, so to make function in
+ * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is
+ * less than 8.
+ */
+ if (c->min_io_size < 8) {
+ c->min_io_size = 8;
+ c->min_io_shift = 3;
+ }
+
+ c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size);
+ c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size);
+
+ /*
+ * Initialize node length ranges which are mostly needed for node
+ * length validation.
+ */
+ c->ranges[UBIFS_PAD_NODE].len = UBIFS_PAD_NODE_SZ;
+ c->ranges[UBIFS_SB_NODE].len = UBIFS_SB_NODE_SZ;
+ c->ranges[UBIFS_MST_NODE].len = UBIFS_MST_NODE_SZ;
+ c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ;
+ c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ;
+ c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ;
+
+ c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ;
+ c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ;
+ c->ranges[UBIFS_ORPH_NODE].min_len =
+ UBIFS_ORPH_NODE_SZ + sizeof(__le64);
+ c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size;
+ c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ;
+ c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ;
+ c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ;
+ c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ;
+ c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ;
+ c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ;
+ /*
+ * Minimum indexing node size is amended later when superblock is
+ * read and the key length is known.
+ */
+ c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ;
+ /*
+ * Maximum indexing node size is amended later when superblock is
+ * read and the fanout is known.
+ */
+ c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;
+
+ /*
+ * Initialize dead and dark LEB space watermarks.
+ *
+ * Dead space is the space which cannot be used. Its watermark is
+ * equivalent to min. I/O unit or minimum node size if it is greater
+ * then min. I/O unit.
+ *
+ * Dark space is the space which might be used, or might not, depending
+ * on which node should be written to the LEB. Its watermark is
+ * equivalent to maximum UBIFS node size.
+ */
+ c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
+ c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
+
+ /*
+ * Calculate how many bytes would be wasted at the end of LEB if it was
+ * fully filled with data nodes of maximum size. This is used in
+ * calculations when reporting free space.
+ */
+ c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ;
+ return 0;
+}
+
+/**
+ * bud_wbuf_callback - bud LEB write-buffer synchronization call-back.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB the write-buffer was synchronized to
+ * @free: how many free bytes left in this LEB
+ * @pad: how many bytes were padded
+ *
+ * This is a callback function which is called by the I/O unit when the
+ * write-buffer is synchronized. We need this to correctly maintain space
+ * accounting in bud logical eraseblocks. This function returns zero in case of
+ * success and a negative error code in case of failure.
+ *
+ * This function actually belongs to the journal, but we keep it here because
+ * we want to keep it static.
+ */
+static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
+{
+ return ubifs_update_one_lp(c, lnum, free, pad, 0, 0);
+}
+
+/*
+ * init_constants_late - initialize UBIFS constants.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function which initializes various UBIFS constants after
+ * the superblock has been read. It also checks various UBIFS parameters and
+ * makes sure they are all right. Returns zero in case of success and a
+ * negative error code in case of failure.
+ */
+static int init_constants_late(struct ubifs_info *c)
+{
+ int tmp, err;
+ uint64_t tmp64;
+
+ c->main_bytes = (long long)c->main_lebs * c->leb_size;
+ c->max_znode_sz = sizeof(struct ubifs_znode) +
+ c->fanout * sizeof(struct ubifs_zbranch);
+
+ tmp = ubifs_idx_node_sz(c, 1);
+ c->ranges[UBIFS_IDX_NODE].min_len = tmp;
+ c->min_idx_node_sz = ALIGN(tmp, 8);
+
+ tmp = ubifs_idx_node_sz(c, c->fanout);
+ c->ranges[UBIFS_IDX_NODE].max_len = tmp;
+ c->max_idx_node_sz = ALIGN(tmp, 8);
+
+ /* Make sure LEB size is large enough to fit full commit */
+ tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt;
+ tmp = ALIGN(tmp, c->min_io_size);
+ if (tmp > c->leb_size) {
+ dbg_err("too small LEB size %d, at least %d needed",
+ c->leb_size, tmp);
+ return -EINVAL;
+ }
+
+ /*
+ * Make sure that the log is large enough to fit reference nodes for
+ * all buds plus one reserved LEB.
+ */
+ tmp64 = c->max_bud_bytes;
+ tmp = do_div(tmp64, c->leb_size);
+ c->max_bud_cnt = tmp64 + !!tmp;
+ tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1);
+ tmp /= c->leb_size;
+ tmp += 1;
+ if (c->log_lebs < tmp) {
+ dbg_err("too small log %d LEBs, required min. %d LEBs",
+ c->log_lebs, tmp);
+ return -EINVAL;
+ }
+
+ /*
+ * When budgeting we assume worst-case scenarios when the pages are not
+ * be compressed and direntries are of the maximum size.
+ *
+ * Note, data, which may be stored in inodes is budgeted separately, so
+ * it is not included into 'c->inode_budget'.
+ */
+ c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE;
+ c->inode_budget = UBIFS_INO_NODE_SZ;
+ c->dent_budget = UBIFS_MAX_DENT_NODE_SZ;
+
+ /*
+ * When the amount of flash space used by buds becomes
+ * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit.
+ * The writers are unblocked when the commit is finished. To avoid
+ * writers to be blocked UBIFS initiates background commit in advance,
+ * when number of bud bytes becomes above the limit defined below.
+ */
+ c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4;
+
+ /*
+ * Ensure minimum journal size. All the bytes in the journal heads are
+ * considered to be used, when calculating the current journal usage.
+ * Consequently, if the journal is too small, UBIFS will treat it as
+ * always full.
+ */
+ tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1;
+ if (c->bg_bud_bytes < tmp64)
+ c->bg_bud_bytes = tmp64;
+ if (c->max_bud_bytes < tmp64 + c->leb_size)
+ c->max_bud_bytes = tmp64 + c->leb_size;
+
+ err = ubifs_calc_lpt_geom(c);
+ if (err)
+ return err;
+
+ c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+
+ /*
+ * Calculate total amount of FS blocks. This number is not used
+ * internally because it does not make much sense for UBIFS, but it is
+ * necessary to report something for the 'statfs()' call.
+ *
+ * Subtract the LEB reserved for GC, the LEB which is reserved for
+ * deletions, and assume only one journal head is available.
+ */
+ tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1;
+ tmp64 *= (uint64_t)c->leb_size - c->leb_overhead;
+ tmp64 = ubifs_reported_space(c, tmp64);
+ c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
+
+ return 0;
+}
+
+/**
+ * take_gc_lnum - reserve GC LEB.
+ * @c: UBIFS file-system description object
+ *
+ * This function ensures that the LEB reserved for garbage collection is
+ * unmapped and is marked as "taken" in lprops. We also have to set free space
+ * to LEB size and dirty space to zero, because lprops may contain out-of-date
+ * information if the file-system was un-mounted before it has been committed.
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int take_gc_lnum(struct ubifs_info *c)
+{
+ int err;
+
+ if (c->gc_lnum == -1) {
+ ubifs_err("no LEB for GC");
+ return -EINVAL;
+ }
+
+ err = ubifs_leb_unmap(c, c->gc_lnum);
+ if (err)
+ return err;
+
+ /* And we have to tell lprops that this LEB is taken */
+ err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0,
+ LPROPS_TAKEN, 0, 0);
+ return err;
+}
+
+/**
+ * alloc_wbufs - allocate write-buffers.
+ * @c: UBIFS file-system description object
+ *
+ * This helper function allocates and initializes UBIFS write-buffers. Returns
+ * zero in case of success and %-ENOMEM in case of failure.
+ */
+static int alloc_wbufs(struct ubifs_info *c)
+{
+ int i, err;
+
+ c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead),
+ GFP_KERNEL);
+ if (!c->jheads)
+ return -ENOMEM;
+
+ /* Initialize journal heads */
+ for (i = 0; i < c->jhead_cnt; i++) {
+ INIT_LIST_HEAD(&c->jheads[i].buds_list);
+ err = ubifs_wbuf_init(c, &c->jheads[i].wbuf);
+ if (err)
+ return err;
+
+ c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
+ c->jheads[i].wbuf.jhead = i;
+ }
+
+ c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM;
+ /*
+ * Garbage Collector head likely contains long-term data and
+ * does not need to be synchronized by timer.
+ */
+ c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
+ c->jheads[GCHD].wbuf.timeout = 0;
+
+ return 0;
+}
+
+/**
+ * free_wbufs - free write-buffers.
+ * @c: UBIFS file-system description object
+ */
+static void free_wbufs(struct ubifs_info *c)
+{
+ int i;
+
+ if (c->jheads) {
+ for (i = 0; i < c->jhead_cnt; i++) {
+ kfree(c->jheads[i].wbuf.buf);
+ kfree(c->jheads[i].wbuf.inodes);
+ }
+ kfree(c->jheads);
+ c->jheads = NULL;
+ }
+}
+
+/**
+ * free_orphans - free orphans.
+ * @c: UBIFS file-system description object
+ */
+static void free_orphans(struct ubifs_info *c)
+{
+ struct ubifs_orphan *orph;
+
+ while (c->orph_dnext) {
+ orph = c->orph_dnext;
+ c->orph_dnext = orph->dnext;
+ list_del(&orph->list);
+ kfree(orph);
+ }
+
+ while (!list_empty(&c->orph_list)) {
+ orph = list_entry(c->orph_list.next, struct ubifs_orphan, list);
+ list_del(&orph->list);
+ kfree(orph);
+ dbg_err("orphan list not empty at unmount");
+ }
+
+ vfree(c->orph_buf);
+ c->orph_buf = NULL;
+}
+
+/**
+ * free_buds - free per-bud objects.
+ * @c: UBIFS file-system description object
+ */
+static void free_buds(struct ubifs_info *c)
+{
+ struct rb_node *this = c->buds.rb_node;
+ struct ubifs_bud *bud;
+
+ while (this) {
+ if (this->rb_left)
+ this = this->rb_left;
+ else if (this->rb_right)
+ this = this->rb_right;
+ else {
+ bud = rb_entry(this, struct ubifs_bud, rb);
+ this = rb_parent(this);
+ if (this) {
+ if (this->rb_left == &bud->rb)
+ this->rb_left = NULL;
+ else
+ this->rb_right = NULL;
+ }
+ kfree(bud);
+ }
+ }
+}
+
+/**
+ * check_volume_empty - check if the UBI volume is empty.
+ * @c: UBIFS file-system description object
+ *
+ * This function checks if the UBIFS volume is empty by looking if its LEBs are
+ * mapped or not. The result of checking is stored in the @c->empty variable.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static int check_volume_empty(struct ubifs_info *c)
+{
+ int lnum, err;
+
+ c->empty = 1;
+ for (lnum = 0; lnum < c->leb_cnt; lnum++) {
+ err = ubi_is_mapped(c->ubi, lnum);
+ if (unlikely(err < 0))
+ return err;
+ if (err == 1) {
+ c->empty = 0;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ return 0;
+}
+
+/*
+ * UBIFS mount options.
+ *
+ * Opt_fast_unmount: do not run a journal commit before un-mounting
+ * Opt_norm_unmount: run a journal commit before un-mounting
+ * Opt_err: just end of array marker
+ */
+enum {
+ Opt_fast_unmount,
+ Opt_norm_unmount,
+ Opt_err,
+};
+
+static const match_table_t tokens = {
+ {Opt_fast_unmount, "fast_unmount"},
+ {Opt_norm_unmount, "norm_unmount"},
+ {Opt_err, NULL},
+};
+
+/**
+ * ubifs_parse_options - parse mount parameters.
+ * @c: UBIFS file-system description object
+ * @options: parameters to parse
+ * @is_remount: non-zero if this is FS re-mount
+ *
+ * This function parses UBIFS mount options and returns zero in case success
+ * and a negative error code in case of failure.
+ */
+static int ubifs_parse_options(struct ubifs_info *c, char *options,
+ int is_remount)
+{
+ char *p;
+ substring_t args[MAX_OPT_ARGS];
+
+ if (!options)
+ return 0;
+
+ while ((p = strsep(&options, ","))) {
+ int token;
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_fast_unmount:
+ c->mount_opts.unmount_mode = 2;
+ c->fast_unmount = 1;
+ break;
+ case Opt_norm_unmount:
+ c->mount_opts.unmount_mode = 1;
+ c->fast_unmount = 0;
+ break;
+ default:
+ ubifs_err("unrecognized mount option \"%s\" "
+ "or missing value", p);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * destroy_journal - destroy journal data structures.
+ * @c: UBIFS file-system description object
+ *
+ * This function destroys journal data structures including those that may have
+ * been created by recovery functions.
+ */
+static void destroy_journal(struct ubifs_info *c)
+{
+ while (!list_empty(&c->unclean_leb_list)) {
+ struct ubifs_unclean_leb *ucleb;
+
+ ucleb = list_entry(c->unclean_leb_list.next,
+ struct ubifs_unclean_leb, list);
+ list_del(&ucleb->list);
+ kfree(ucleb);
+ }
+ while (!list_empty(&c->old_buds)) {
+ struct ubifs_bud *bud;
+
+ bud = list_entry(c->old_buds.next, struct ubifs_bud, list);
+ list_del(&bud->list);
+ kfree(bud);
+ }
+ ubifs_destroy_idx_gc(c);
+ ubifs_destroy_size_tree(c);
+ ubifs_tnc_close(c);
+ free_buds(c);
+}
+
+/**
+ * mount_ubifs - mount UBIFS file-system.
+ * @c: UBIFS file-system description object
+ *
+ * This function mounts UBIFS file system. Returns zero in case of success and
+ * a negative error code in case of failure.
+ *
+ * Note, the function does not de-allocate resources it it fails half way
+ * through, and the caller has to do this instead.
+ */
+static int mount_ubifs(struct ubifs_info *c)
+{
+ struct super_block *sb = c->vfs_sb;
+ int err, mounted_read_only = (sb->s_flags & MS_RDONLY);
+ long long x;
+ size_t sz;
+
+ err = init_constants_early(c);
+ if (err)
+ return err;
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ c->dbg_buf = vmalloc(c->leb_size);
+ if (!c->dbg_buf)
+ return -ENOMEM;
+#endif
+
+ err = check_volume_empty(c);
+ if (err)
+ goto out_free;
+
+ if (c->empty && (mounted_read_only || c->ro_media)) {
+ /*
+ * This UBI volume is empty, and read-only, or the file system
+ * is mounted read-only - we cannot format it.
+ */
+ ubifs_err("can't format empty UBI volume: read-only %s",
+ c->ro_media ? "UBI volume" : "mount");
+ err = -EROFS;
+ goto out_free;
+ }
+
+ if (c->ro_media && !mounted_read_only) {
+ ubifs_err("cannot mount read-write - read-only media");
+ err = -EROFS;
+ goto out_free;
+ }
+
+ /*
+ * The requirement for the buffer is that it should fit indexing B-tree
+ * height amount of integers. We assume the height if the TNC tree will
+ * never exceed 64.
+ */
+ err = -ENOMEM;
+ c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL);
+ if (!c->bottom_up_buf)
+ goto out_free;
+
+ c->sbuf = vmalloc(c->leb_size);
+ if (!c->sbuf)
+ goto out_free;
+
+ if (!mounted_read_only) {
+ c->ileb_buf = vmalloc(c->leb_size);
+ if (!c->ileb_buf)
+ goto out_free;
+ }
+
+ err = ubifs_read_superblock(c);
+ if (err)
+ goto out_free;
+
+ /*
+ * Make sure the compressor which is set as the default on in the
+ * superblock was actually compiled in.
+ */
+ if (!ubifs_compr_present(c->default_compr)) {
+ ubifs_warn("'%s' compressor is set by superblock, but not "
+ "compiled in", ubifs_compr_name(c->default_compr));
+ c->default_compr = UBIFS_COMPR_NONE;
+ }
+
+ dbg_failure_mode_registration(c);
+
+ err = init_constants_late(c);
+ if (err)
+ goto out_dereg;
+
+ sz = ALIGN(c->max_idx_node_sz, c->min_io_size);
+ sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);
+ c->cbuf = kmalloc(sz, GFP_NOFS);
+ if (!c->cbuf) {
+ err = -ENOMEM;
+ goto out_dereg;
+ }
+
+ sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
+ if (!mounted_read_only) {
+ err = alloc_wbufs(c);
+ if (err)
+ goto out_cbuf;
+
+ /* Create background thread */
+ c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
+ if (!c->bgt)
+ c->bgt = ERR_PTR(-EINVAL);
+ if (IS_ERR(c->bgt)) {
+ err = PTR_ERR(c->bgt);
+ c->bgt = NULL;
+ ubifs_err("cannot spawn \"%s\", error %d",
+ c->bgt_name, err);
+ goto out_wbufs;
+ }
+ wake_up_process(c->bgt);
+ }
+
+ err = ubifs_read_master(c);
+ if (err)
+ goto out_master;
+
+ if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
+ ubifs_msg("recovery needed");
+ c->need_recovery = 1;
+ if (!mounted_read_only) {
+ err = ubifs_recover_inl_heads(c, c->sbuf);
+ if (err)
+ goto out_master;
+ }
+ } else if (!mounted_read_only) {
+ /*
+ * Set the "dirty" flag so that if we reboot uncleanly we
+ * will notice this immediately on the next mount.
+ */
+ c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
+ err = ubifs_write_master(c);
+ if (err)
+ goto out_master;
+ }
+
+ err = ubifs_lpt_init(c, 1, !mounted_read_only);
+ if (err)
+ goto out_lpt;
+
+ err = dbg_check_idx_size(c, c->old_idx_sz);
+ if (err)
+ goto out_lpt;
+
+ err = ubifs_replay_journal(c);
+ if (err)
+ goto out_journal;
+
+ err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only);
+ if (err)
+ goto out_orphans;
+
+ if (!mounted_read_only) {
+ int lnum;
+
+ /* Check for enough free space */
+ if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
+ ubifs_err("insufficient available space");
+ err = -EINVAL;
+ goto out_orphans;
+ }
+
+ /* Check for enough log space */
+ lnum = c->lhead_lnum + 1;
+ if (lnum >= UBIFS_LOG_LNUM + c->log_lebs)
+ lnum = UBIFS_LOG_LNUM;
+ if (lnum == c->ltail_lnum) {
+ err = ubifs_consolidate_log(c);
+ if (err)
+ goto out_orphans;
+ }
+
+ if (c->need_recovery) {
+ err = ubifs_recover_size(c);
+ if (err)
+ goto out_orphans;
+ err = ubifs_rcvry_gc_commit(c);
+ } else
+ err = take_gc_lnum(c);
+ if (err)
+ goto out_orphans;
+
+ err = dbg_check_lprops(c);
+ if (err)
+ goto out_orphans;
+ } else if (c->need_recovery) {
+ err = ubifs_recover_size(c);
+ if (err)
+ goto out_orphans;
+ }
+
+ spin_lock(&ubifs_infos_lock);
+ list_add_tail(&c->infos_list, &ubifs_infos);
+ spin_unlock(&ubifs_infos_lock);
+
+ if (c->need_recovery) {
+ if (mounted_read_only)
+ ubifs_msg("recovery deferred");
+ else {
+ c->need_recovery = 0;
+ ubifs_msg("recovery completed");
+ }
+ }
+
+ err = dbg_check_filesystem(c);
+ if (err)
+ goto out_infos;
+
+ ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
+ c->vi.ubi_num, c->vi.vol_id, c->vi.name);
+ if (mounted_read_only)
+ ubifs_msg("mounted read-only");
+ x = (long long)c->main_lebs * c->leb_size;
+ ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)",
+ x, x >> 10, x >> 20, c->main_lebs);
+ x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
+ ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d LEBs)",
+ x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
+ ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
+ ubifs_msg("media format %d, latest format %d",
+ c->fmt_version, UBIFS_FORMAT_VERSION);
+
+ dbg_msg("compiled on: " __DATE__ " at " __TIME__);
+ dbg_msg("min. I/O unit size: %d bytes", c->min_io_size);
+ dbg_msg("LEB size: %d bytes (%d KiB)",
+ c->leb_size, c->leb_size / 1024);
+ dbg_msg("data journal heads: %d",
+ c->jhead_cnt - NONDATA_JHEADS_CNT);
+ dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X"
+ "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X",
+ c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3],
+ c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
+ c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
+ c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
+ dbg_msg("fast unmount: %d", c->fast_unmount);
+ dbg_msg("big_lpt %d", c->big_lpt);
+ dbg_msg("log LEBs: %d (%d - %d)",
+ c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
+ dbg_msg("LPT area LEBs: %d (%d - %d)",
+ c->lpt_lebs, c->lpt_first, c->lpt_last);
+ dbg_msg("orphan area LEBs: %d (%d - %d)",
+ c->orph_lebs, c->orph_first, c->orph_last);
+ dbg_msg("main area LEBs: %d (%d - %d)",
+ c->main_lebs, c->main_first, c->leb_cnt - 1);
+ dbg_msg("index LEBs: %d", c->lst.idx_lebs);
+ dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)",
+ c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20);
+ dbg_msg("key hash type: %d", c->key_hash_type);
+ dbg_msg("tree fanout: %d", c->fanout);
+ dbg_msg("reserved GC LEB: %d", c->gc_lnum);
+ dbg_msg("first main LEB: %d", c->main_first);
+ dbg_msg("dead watermark: %d", c->dead_wm);
+ dbg_msg("dark watermark: %d", c->dark_wm);
+ x = (long long)c->main_lebs * c->dark_wm;
+ dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)",
+ x, x >> 10, x >> 20);
+ dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)",
+ c->max_bud_bytes, c->max_bud_bytes >> 10,
+ c->max_bud_bytes >> 20);
+ dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)",
+ c->bg_bud_bytes, c->bg_bud_bytes >> 10,
+ c->bg_bud_bytes >> 20);
+ dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)",
+ c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20);
+ dbg_msg("max. seq. number: %llu", c->max_sqnum);
+ dbg_msg("commit number: %llu", c->cmt_no);
+
+ return 0;
+
+out_infos:
+ spin_lock(&ubifs_infos_lock);
+ list_del(&c->infos_list);
+ spin_unlock(&ubifs_infos_lock);
+out_orphans:
+ free_orphans(c);
+out_journal:
+ destroy_journal(c);
+out_lpt:
+ ubifs_lpt_free(c, 0);
+out_master:
+ kfree(c->mst_node);
+ kfree(c->rcvrd_mst_node);
+ if (c->bgt)
+ kthread_stop(c->bgt);
+out_wbufs:
+ free_wbufs(c);
+out_cbuf:
+ kfree(c->cbuf);
+out_dereg:
+ dbg_failure_mode_deregistration(c);
+out_free:
+ vfree(c->ileb_buf);
+ vfree(c->sbuf);
+ kfree(c->bottom_up_buf);
+ UBIFS_DBG(vfree(c->dbg_buf));
+ return err;
+}
+
+/**
+ * ubifs_umount - un-mount UBIFS file-system.
+ * @c: UBIFS file-system description object
+ *
+ * Note, this function is called to free allocated resourced when un-mounting,
+ * as well as free resources when an error occurred while we were half way
+ * through mounting (error path cleanup function). So it has to make sure the
+ * resource was actually allocated before freeing it.
+ */
+static void ubifs_umount(struct ubifs_info *c)
+{
+ dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num,
+ c->vi.vol_id);
+
+ spin_lock(&ubifs_infos_lock);
+ list_del(&c->infos_list);
+ spin_unlock(&ubifs_infos_lock);
+
+ if (c->bgt)
+ kthread_stop(c->bgt);
+
+ destroy_journal(c);
+ free_wbufs(c);
+ free_orphans(c);
+ ubifs_lpt_free(c, 0);
+
+ kfree(c->cbuf);
+ kfree(c->rcvrd_mst_node);
+ kfree(c->mst_node);
+ vfree(c->sbuf);
+ kfree(c->bottom_up_buf);
+ UBIFS_DBG(vfree(c->dbg_buf));
+ vfree(c->ileb_buf);
+ dbg_failure_mode_deregistration(c);
+}
+
+/**
+ * ubifs_remount_rw - re-mount in read-write mode.
+ * @c: UBIFS file-system description object
+ *
+ * UBIFS avoids allocating many unnecessary resources when mounted in read-only
+ * mode. This function allocates the needed resources and re-mounts UBIFS in
+ * read-write mode.
+ */
+static int ubifs_remount_rw(struct ubifs_info *c)
+{
+ int err, lnum;
+
+ if (c->ro_media)
+ return -EINVAL;
+
+ mutex_lock(&c->umount_mutex);
+ c->remounting_rw = 1;
+
+ /* Check for enough free space */
+ if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
+ ubifs_err("insufficient available space");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (c->old_leb_cnt != c->leb_cnt) {
+ struct ubifs_sb_node *sup;
+
+ sup = ubifs_read_sb_node(c);
+ if (IS_ERR(sup)) {
+ err = PTR_ERR(sup);
+ goto out;
+ }
+ sup->leb_cnt = cpu_to_le32(c->leb_cnt);
+ err = ubifs_write_sb_node(c, sup);
+ if (err)
+ goto out;
+ }
+
+ if (c->need_recovery) {
+ ubifs_msg("completing deferred recovery");
+ err = ubifs_write_rcvrd_mst_node(c);
+ if (err)
+ goto out;
+ err = ubifs_recover_size(c);
+ if (err)
+ goto out;
+ err = ubifs_clean_lebs(c, c->sbuf);
+ if (err)
+ goto out;
+ err = ubifs_recover_inl_heads(c, c->sbuf);
+ if (err)
+ goto out;
+ }
+
+ if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) {
+ c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);
+ err = ubifs_write_master(c);
+ if (err)
+ goto out;
+ }
+
+ c->ileb_buf = vmalloc(c->leb_size);
+ if (!c->ileb_buf) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = ubifs_lpt_init(c, 0, 1);
+ if (err)
+ goto out;
+
+ err = alloc_wbufs(c);
+ if (err)
+ goto out;
+
+ ubifs_create_buds_lists(c);
+
+ /* Create background thread */
+ c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name);
+ if (!c->bgt)
+ c->bgt = ERR_PTR(-EINVAL);
+ if (IS_ERR(c->bgt)) {
+ err = PTR_ERR(c->bgt);
+ c->bgt = NULL;
+ ubifs_err("cannot spawn \"%s\", error %d",
+ c->bgt_name, err);
+ return err;
+ }
+ wake_up_process(c->bgt);
+
+ c->orph_buf = vmalloc(c->leb_size);
+ if (!c->orph_buf)
+ return -ENOMEM;
+
+ /* Check for enough log space */
+ lnum = c->lhead_lnum + 1;
+ if (lnum >= UBIFS_LOG_LNUM + c->log_lebs)
+ lnum = UBIFS_LOG_LNUM;
+ if (lnum == c->ltail_lnum) {
+ err = ubifs_consolidate_log(c);
+ if (err)
+ goto out;
+ }
+
+ if (c->need_recovery)
+ err = ubifs_rcvry_gc_commit(c);
+ else
+ err = take_gc_lnum(c);
+ if (err)
+ goto out;
+
+ if (c->need_recovery) {
+ c->need_recovery = 0;
+ ubifs_msg("deferred recovery completed");
+ }
+
+ dbg_gen("re-mounted read-write");
+ c->vfs_sb->s_flags &= ~MS_RDONLY;
+ c->remounting_rw = 0;
+ mutex_unlock(&c->umount_mutex);
+ return 0;
+
+out:
+ vfree(c->orph_buf);
+ c->orph_buf = NULL;
+ if (c->bgt) {
+ kthread_stop(c->bgt);
+ c->bgt = NULL;
+ }
+ free_wbufs(c);
+ vfree(c->ileb_buf);
+ c->ileb_buf = NULL;
+ ubifs_lpt_free(c, 1);
+ c->remounting_rw = 0;
+ mutex_unlock(&c->umount_mutex);
+ return err;
+}
+
+/**
+ * commit_on_unmount - commit the journal when un-mounting.
+ * @c: UBIFS file-system description object
+ *
+ * This function is called during un-mounting and it commits the journal unless
+ * the "fast unmount" mode is enabled. It also avoids committing the journal if
+ * it contains too few data.
+ *
+ * Sometimes recovery requires the journal to be committed at least once, and
+ * this function takes care about this.
+ */
+static void commit_on_unmount(struct ubifs_info *c)
+{
+ if (!c->fast_unmount) {
+ long long bud_bytes;
+
+ spin_lock(&c->buds_lock);
+ bud_bytes = c->bud_bytes;
+ spin_unlock(&c->buds_lock);
+ if (bud_bytes > c->leb_size)
+ ubifs_run_commit(c);
+ }
+}
+
+/**
+ * ubifs_remount_ro - re-mount in read-only mode.
+ * @c: UBIFS file-system description object
+ *
+ * We rely on VFS to have stopped writing. Possibly the background thread could
+ * be running a commit, however kthread_stop will wait in that case.
+ */
+static void ubifs_remount_ro(struct ubifs_info *c)
+{
+ int i, err;
+
+ ubifs_assert(!c->need_recovery);
+ commit_on_unmount(c);
+
+ mutex_lock(&c->umount_mutex);
+ if (c->bgt) {
+ kthread_stop(c->bgt);
+ c->bgt = NULL;
+ }
+
+ for (i = 0; i < c->jhead_cnt; i++) {
+ ubifs_wbuf_sync(&c->jheads[i].wbuf);
+ del_timer_sync(&c->jheads[i].wbuf.timer);
+ }
+
+ if (!c->ro_media) {
+ c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
+ c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
+ c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
+ err = ubifs_write_master(c);
+ if (err)
+ ubifs_ro_mode(c, err);
+ }
+
+ ubifs_destroy_idx_gc(c);
+ free_wbufs(c);
+ vfree(c->orph_buf);
+ c->orph_buf = NULL;
+ vfree(c->ileb_buf);
+ c->ileb_buf = NULL;
+ ubifs_lpt_free(c, 1);
+ mutex_unlock(&c->umount_mutex);
+}
+
+static void ubifs_put_super(struct super_block *sb)
+{
+ int i;
+ struct ubifs_info *c = sb->s_fs_info;
+
+ ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
+ c->vi.vol_id);
+ /*
+ * The following asserts are only valid if there has not been a failure
+ * of the media. For example, there will be dirty inodes if we failed
+ * to write them back because of I/O errors.
+ */
+ ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0);
+ ubifs_assert(c->budg_idx_growth == 0);
+ ubifs_assert(c->budg_dd_growth == 0);
+ ubifs_assert(c->budg_data_growth == 0);
+
+ /*
+ * The 'c->umount_lock' prevents races between UBIFS memory shrinker
+ * and file system un-mount. Namely, it prevents the shrinker from
+ * picking this superblock for shrinking - it will be just skipped if
+ * the mutex is locked.
+ */
+ mutex_lock(&c->umount_mutex);
+ if (!(c->vfs_sb->s_flags & MS_RDONLY)) {
+ /*
+ * First of all kill the background thread to make sure it does
+ * not interfere with un-mounting and freeing resources.
+ */
+ if (c->bgt) {
+ kthread_stop(c->bgt);
+ c->bgt = NULL;
+ }
+
+ /* Synchronize write-buffers */
+ if (c->jheads)
+ for (i = 0; i < c->jhead_cnt; i++) {
+ ubifs_wbuf_sync(&c->jheads[i].wbuf);
+ del_timer_sync(&c->jheads[i].wbuf.timer);
+ }
+
+ /*
+ * On fatal errors c->ro_media is set to 1, in which case we do
+ * not write the master node.
+ */
+ if (!c->ro_media) {
+ /*
+ * We are being cleanly unmounted which means the
+ * orphans were killed - indicate this in the master
+ * node. Also save the reserved GC LEB number.
+ */
+ int err;
+
+ c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
+ c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
+ c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
+ err = ubifs_write_master(c);
+ if (err)
+ /*
+ * Recovery will attempt to fix the master area
+ * next mount, so we just print a message and
+ * continue to unmount normally.
+ */
+ ubifs_err("failed to write master node, "
+ "error %d", err);
+ }
+ }
+
+ ubifs_umount(c);
+ bdi_destroy(&c->bdi);
+ ubi_close_volume(c->ubi);
+ mutex_unlock(&c->umount_mutex);
+ kfree(c);
+}
+
+static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+ int err;
+ struct ubifs_info *c = sb->s_fs_info;
+
+ dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
+
+ err = ubifs_parse_options(c, data, 1);
+ if (err) {
+ ubifs_err("invalid or unknown remount parameter");
+ return err;
+ }
+ if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
+ err = ubifs_remount_rw(c);
+ if (err)
+ return err;
+ } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
+ ubifs_remount_ro(c);
+
+ return 0;
+}
+
+struct super_operations ubifs_super_operations = {
+ .alloc_inode = ubifs_alloc_inode,
+ .destroy_inode = ubifs_destroy_inode,
+ .put_super = ubifs_put_super,
+ .write_inode = ubifs_write_inode,
+ .delete_inode = ubifs_delete_inode,
+ .statfs = ubifs_statfs,
+ .dirty_inode = ubifs_dirty_inode,
+ .remount_fs = ubifs_remount_fs,
+ .show_options = ubifs_show_options,
+ .sync_fs = ubifs_sync_fs,
+};
+
+/**
+ * open_ubi - parse UBI device name string and open the UBI device.
+ * @name: UBI volume name
+ * @mode: UBI volume open mode
+ *
+ * There are several ways to specify UBI volumes when mounting UBIFS:
+ * o ubiX_Y - UBI device number X, volume Y;
+ * o ubiY - UBI device number 0, volume Y;
+ * o ubiX:NAME - mount UBI device X, volume with name NAME;
+ * o ubi:NAME - mount UBI device 0, volume with name NAME.
+ *
+ * Alternative '!' separator may be used instead of ':' (because some shells
+ * like busybox may interpret ':' as an NFS host name separator). This function
+ * returns ubi volume object in case of success and a negative error code in
+ * case of failure.
+ */
+static struct ubi_volume_desc *open_ubi(const char *name, int mode)
+{
+ int dev, vol;
+ char *endptr;
+
+ if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
+ return ERR_PTR(-EINVAL);
+
+ /* ubi:NAME method */
+ if ((name[3] == ':' || name[3] == '!') && name[4] != '\0')
+ return ubi_open_volume_nm(0, name + 4, mode);
+
+ if (!isdigit(name[3]))
+ return ERR_PTR(-EINVAL);
+
+ dev = simple_strtoul(name + 3, &endptr, 0);
+
+ /* ubiY method */
+ if (*endptr == '\0')
+ return ubi_open_volume(0, dev, mode);
+
+ /* ubiX_Y method */
+ if (*endptr == '_' && isdigit(endptr[1])) {
+ vol = simple_strtoul(endptr + 1, &endptr, 0);
+ if (*endptr != '\0')
+ return ERR_PTR(-EINVAL);
+ return ubi_open_volume(dev, vol, mode);
+ }
+
+ /* ubiX:NAME method */
+ if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0')
+ return ubi_open_volume_nm(dev, ++endptr, mode);
+
+ return ERR_PTR(-EINVAL);
+}
+
+static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct ubi_volume_desc *ubi = sb->s_fs_info;
+ struct ubifs_info *c;
+ struct inode *root;
+ int err;
+
+ c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL);
+ if (!c)
+ return -ENOMEM;
+
+ spin_lock_init(&c->cnt_lock);
+ spin_lock_init(&c->cs_lock);
+ spin_lock_init(&c->buds_lock);
+ spin_lock_init(&c->space_lock);
+ spin_lock_init(&c->orphan_lock);
+ init_rwsem(&c->commit_sem);
+ mutex_init(&c->lp_mutex);
+ mutex_init(&c->tnc_mutex);
+ mutex_init(&c->log_mutex);
+ mutex_init(&c->mst_mutex);
+ mutex_init(&c->umount_mutex);
+ init_waitqueue_head(&c->cmt_wq);
+ c->buds = RB_ROOT;
+ c->old_idx = RB_ROOT;
+ c->size_tree = RB_ROOT;
+ c->orph_tree = RB_ROOT;
+ INIT_LIST_HEAD(&c->infos_list);
+ INIT_LIST_HEAD(&c->idx_gc);
+ INIT_LIST_HEAD(&c->replay_list);
+ INIT_LIST_HEAD(&c->replay_buds);
+ INIT_LIST_HEAD(&c->uncat_list);
+ INIT_LIST_HEAD(&c->empty_list);
+ INIT_LIST_HEAD(&c->freeable_list);
+ INIT_LIST_HEAD(&c->frdi_idx_list);
+ INIT_LIST_HEAD(&c->unclean_leb_list);
+ INIT_LIST_HEAD(&c->old_buds);
+ INIT_LIST_HEAD(&c->orph_list);
+ INIT_LIST_HEAD(&c->orph_new);
+
+ c->highest_inum = UBIFS_FIRST_INO;
+ c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM;
+
+ ubi_get_volume_info(ubi, &c->vi);
+ ubi_get_device_info(c->vi.ubi_num, &c->di);
+
+ /* Re-open the UBI device in read-write mode */
+ c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE);
+ if (IS_ERR(c->ubi)) {
+ err = PTR_ERR(c->ubi);
+ goto out_free;
+ }
+
+ /*
+ * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
+ * UBIFS, I/O is not deferred, it is done immediately in readpage,
+ * which means the user would have to wait not just for their own I/O
+ * but the read-ahead I/O as well i.e. completely pointless.
+ *
+ * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
+ */
+ c->bdi.capabilities = BDI_CAP_MAP_COPY;
+ c->bdi.unplug_io_fn = default_unplug_io_fn;
+ err = bdi_init(&c->bdi);
+ if (err)
+ goto out_close;
+
+ err = ubifs_parse_options(c, data, 0);
+ if (err)
+ goto out_bdi;
+
+ c->vfs_sb = sb;
+
+ sb->s_fs_info = c;
+ sb->s_magic = UBIFS_SUPER_MAGIC;
+ sb->s_blocksize = UBIFS_BLOCK_SIZE;
+ sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT;
+ sb->s_dev = c->vi.cdev;
+ sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c);
+ if (c->max_inode_sz > MAX_LFS_FILESIZE)
+ sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
+ sb->s_op = &ubifs_super_operations;
+
+ mutex_lock(&c->umount_mutex);
+ err = mount_ubifs(c);
+ if (err) {
+ ubifs_assert(err < 0);
+ goto out_unlock;
+ }
+
+ /* Read the root inode */
+ root = ubifs_iget(sb, UBIFS_ROOT_INO);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto out_umount;
+ }
+
+ sb->s_root = d_alloc_root(root);
+ if (!sb->s_root)
+ goto out_iput;
+
+ mutex_unlock(&c->umount_mutex);
+
+ return 0;
+
+out_iput:
+ iput(root);
+out_umount:
+ ubifs_umount(c);
+out_unlock:
+ mutex_unlock(&c->umount_mutex);
+out_bdi:
+ bdi_destroy(&c->bdi);
+out_close:
+ ubi_close_volume(c->ubi);
+out_free:
+ kfree(c);
+ return err;
+}
+
+static int sb_test(struct super_block *sb, void *data)
+{
+ dev_t *dev = data;
+
+ return sb->s_dev == *dev;
+}
+
+static int sb_set(struct super_block *sb, void *data)
+{
+ dev_t *dev = data;
+
+ sb->s_dev = *dev;
+ return 0;
+}
+
+static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
+ const char *name, void *data, struct vfsmount *mnt)
+{
+ struct ubi_volume_desc *ubi;
+ struct ubi_volume_info vi;
+ struct super_block *sb;
+ int err;
+
+ dbg_gen("name %s, flags %#x", name, flags);
+
+ /*
+ * Get UBI device number and volume ID. Mount it read-only so far
+ * because this might be a new mount point, and UBI allows only one
+ * read-write user at a time.
+ */
+ ubi = open_ubi(name, UBI_READONLY);
+ if (IS_ERR(ubi)) {
+ ubifs_err("cannot open \"%s\", error %d",
+ name, (int)PTR_ERR(ubi));
+ return PTR_ERR(ubi);
+ }
+ ubi_get_volume_info(ubi, &vi);
+
+ dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id);
+
+ sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev);
+ if (IS_ERR(sb)) {
+ err = PTR_ERR(sb);
+ goto out_close;
+ }
+
+ if (sb->s_root) {
+ /* A new mount point for already mounted UBIFS */
+ dbg_gen("this ubi volume is already mounted");
+ if ((flags ^ sb->s_flags) & MS_RDONLY) {
+ err = -EBUSY;
+ goto out_deact;
+ }
+ } else {
+ sb->s_flags = flags;
+ /*
+ * Pass 'ubi' to 'fill_super()' in sb->s_fs_info where it is
+ * replaced by 'c'.
+ */
+ sb->s_fs_info = ubi;
+ err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
+ if (err)
+ goto out_deact;
+ /* We do not support atime */
+ sb->s_flags |= MS_ACTIVE | MS_NOATIME;
+ }
+
+ /* 'fill_super()' opens ubi again so we must close it here */
+ ubi_close_volume(ubi);
+
+ return simple_set_mnt(mnt, sb);
+
+out_deact:
+ up_write(&sb->s_umount);
+ deactivate_super(sb);
+out_close:
+ ubi_close_volume(ubi);
+ return err;
+}
+
+static void ubifs_kill_sb(struct super_block *sb)
+{
+ struct ubifs_info *c = sb->s_fs_info;
+
+ /*
+ * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
+ * in order to be outside BKL.
+ */
+ if (sb->s_root && !(sb->s_flags & MS_RDONLY))
+ commit_on_unmount(c);
+ /* The un-mount routine is actually done in put_super() */
+ generic_shutdown_super(sb);
+}
+
+static struct file_system_type ubifs_fs_type = {
+ .name = "ubifs",
+ .owner = THIS_MODULE,
+ .get_sb = ubifs_get_sb,
+ .kill_sb = ubifs_kill_sb
+};
+
+/*
+ * Inode slab cache constructor.
+ */
+static void inode_slab_ctor(void *obj)
+{
+ struct ubifs_inode *ui = obj;
+ inode_init_once(&ui->vfs_inode);
+}
+
+static int __init ubifs_init(void)
+{
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24);
+
+ /* Make sure node sizes are 8-byte aligned */
+ BUILD_BUG_ON(UBIFS_CH_SZ & 7);
+ BUILD_BUG_ON(UBIFS_INO_NODE_SZ & 7);
+ BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7);
+ BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7);
+ BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7);
+ BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7);
+ BUILD_BUG_ON(UBIFS_SB_NODE_SZ & 7);
+ BUILD_BUG_ON(UBIFS_MST_NODE_SZ & 7);
+ BUILD_BUG_ON(UBIFS_REF_NODE_SZ & 7);
+ BUILD_BUG_ON(UBIFS_CS_NODE_SZ & 7);
+ BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7);
+
+ BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7);
+ BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7);
+ BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7);
+ BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ & 7);
+ BUILD_BUG_ON(UBIFS_MAX_NODE_SZ & 7);
+ BUILD_BUG_ON(MIN_WRITE_SZ & 7);
+
+ /* Check min. node size */
+ BUILD_BUG_ON(UBIFS_INO_NODE_SZ < MIN_WRITE_SZ);
+ BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ);
+ BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ);
+ BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ);
+
+ BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ);
+ BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ);
+ BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ);
+ BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ > UBIFS_MAX_NODE_SZ);
+
+ /* Defined node sizes */
+ BUILD_BUG_ON(UBIFS_SB_NODE_SZ != 4096);
+ BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512);
+ BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160);
+ BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64);
+
+ /*
+ * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
+ * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
+ */
+ if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) {
+ ubifs_err("VFS page cache size is %u bytes, but UBIFS requires"
+ " at least 4096 bytes",
+ (unsigned int)PAGE_CACHE_SIZE);
+ return -EINVAL;
+ }
+
+ err = register_filesystem(&ubifs_fs_type);
+ if (err) {
+ ubifs_err("cannot register file system, error %d", err);
+ return err;
+ }
+
+ err = -ENOMEM;
+ ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
+ sizeof(struct ubifs_inode), 0,
+ SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
+ &inode_slab_ctor);
+ if (!ubifs_inode_slab)
+ goto out_reg;
+
+ register_shrinker(&ubifs_shrinker_info);
+
+ err = ubifs_compressors_init();
+ if (err)
+ goto out_compr;
+
+ return 0;
+
+out_compr:
+ unregister_shrinker(&ubifs_shrinker_info);
+ kmem_cache_destroy(ubifs_inode_slab);
+out_reg:
+ unregister_filesystem(&ubifs_fs_type);
+ return err;
+}
+/* late_initcall to let compressors initialize first */
+late_initcall(ubifs_init);
+
+static void __exit ubifs_exit(void)
+{
+ ubifs_assert(list_empty(&ubifs_infos));
+ ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0);
+
+ ubifs_compressors_exit();
+ unregister_shrinker(&ubifs_shrinker_info);
+ kmem_cache_destroy(ubifs_inode_slab);
+ unregister_filesystem(&ubifs_fs_type);
+}
+module_exit(ubifs_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_VERSION(__stringify(UBIFS_VERSION));
+MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter");
+MODULE_DESCRIPTION("UBIFS - UBI File System");
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
new file mode 100644
index 00000000000..7634c597088
--- /dev/null
+++ b/fs/ubifs/tnc.c
@@ -0,0 +1,2962 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements TNC (Tree Node Cache) which caches indexing nodes of
+ * the UBIFS B-tree.
+ *
+ * At the moment the locking rules of the TNC tree are quite simple and
+ * straightforward. We just have a mutex and lock it when we traverse the
+ * tree. If a znode is not in memory, we read it from flash while still having
+ * the mutex locked.
+ */
+
+#include <linux/crc32.h>
+#include "ubifs.h"
+
+/*
+ * Returned codes of 'matches_name()' and 'fallible_matches_name()' functions.
+ * @NAME_LESS: name corresponding to the first argument is less than second
+ * @NAME_MATCHES: names match
+ * @NAME_GREATER: name corresponding to the second argument is greater than
+ * first
+ * @NOT_ON_MEDIA: node referred by zbranch does not exist on the media
+ *
+ * These constants were introduce to improve readability.
+ */
+enum {
+ NAME_LESS = 0,
+ NAME_MATCHES = 1,
+ NAME_GREATER = 2,
+ NOT_ON_MEDIA = 3,
+};
+
+/**
+ * insert_old_idx - record an index node obsoleted since the last commit start.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of obsoleted index node
+ * @offs: offset of obsoleted index node
+ *
+ * Returns %0 on success, and a negative error code on failure.
+ *
+ * For recovery, there must always be a complete intact version of the index on
+ * flash at all times. That is called the "old index". It is the index as at the
+ * time of the last successful commit. Many of the index nodes in the old index
+ * may be dirty, but they must not be erased until the next successful commit
+ * (at which point that index becomes the old index).
+ *
+ * That means that the garbage collection and the in-the-gaps method of
+ * committing must be able to determine if an index node is in the old index.
+ * Most of the old index nodes can be found by looking up the TNC using the
+ * 'lookup_znode()' function. However, some of the old index nodes may have
+ * been deleted from the current index or may have been changed so much that
+ * they cannot be easily found. In those cases, an entry is added to an RB-tree.
+ * That is what this function does. The RB-tree is ordered by LEB number and
+ * offset because they uniquely identify the old index node.
+ */
+static int insert_old_idx(struct ubifs_info *c, int lnum, int offs)
+{
+ struct ubifs_old_idx *old_idx, *o;
+ struct rb_node **p, *parent = NULL;
+
+ old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS);
+ if (unlikely(!old_idx))
+ return -ENOMEM;
+ old_idx->lnum = lnum;
+ old_idx->offs = offs;
+
+ p = &c->old_idx.rb_node;
+ while (*p) {
+ parent = *p;
+ o = rb_entry(parent, struct ubifs_old_idx, rb);
+ if (lnum < o->lnum)
+ p = &(*p)->rb_left;
+ else if (lnum > o->lnum)
+ p = &(*p)->rb_right;
+ else if (offs < o->offs)
+ p = &(*p)->rb_left;
+ else if (offs > o->offs)
+ p = &(*p)->rb_right;
+ else {
+ ubifs_err("old idx added twice!");
+ kfree(old_idx);
+ return 0;
+ }
+ }
+ rb_link_node(&old_idx->rb, parent, p);
+ rb_insert_color(&old_idx->rb, &c->old_idx);
+ return 0;
+}
+
+/**
+ * insert_old_idx_znode - record a znode obsoleted since last commit start.
+ * @c: UBIFS file-system description object
+ * @znode: znode of obsoleted index node
+ *
+ * Returns %0 on success, and a negative error code on failure.
+ */
+int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode)
+{
+ if (znode->parent) {
+ struct ubifs_zbranch *zbr;
+
+ zbr = &znode->parent->zbranch[znode->iip];
+ if (zbr->len)
+ return insert_old_idx(c, zbr->lnum, zbr->offs);
+ } else
+ if (c->zroot.len)
+ return insert_old_idx(c, c->zroot.lnum,
+ c->zroot.offs);
+ return 0;
+}
+
+/**
+ * ins_clr_old_idx_znode - record a znode obsoleted since last commit start.
+ * @c: UBIFS file-system description object
+ * @znode: znode of obsoleted index node
+ *
+ * Returns %0 on success, and a negative error code on failure.
+ */
+static int ins_clr_old_idx_znode(struct ubifs_info *c,
+ struct ubifs_znode *znode)
+{
+ int err;
+
+ if (znode->parent) {
+ struct ubifs_zbranch *zbr;
+
+ zbr = &znode->parent->zbranch[znode->iip];
+ if (zbr->len) {
+ err = insert_old_idx(c, zbr->lnum, zbr->offs);
+ if (err)
+ return err;
+ zbr->lnum = 0;
+ zbr->offs = 0;
+ zbr->len = 0;
+ }
+ } else
+ if (c->zroot.len) {
+ err = insert_old_idx(c, c->zroot.lnum, c->zroot.offs);
+ if (err)
+ return err;
+ c->zroot.lnum = 0;
+ c->zroot.offs = 0;
+ c->zroot.len = 0;
+ }
+ return 0;
+}
+
+/**
+ * destroy_old_idx - destroy the old_idx RB-tree.
+ * @c: UBIFS file-system description object
+ *
+ * During start commit, the old_idx RB-tree is used to avoid overwriting index
+ * nodes that were in the index last commit but have since been deleted. This
+ * is necessary for recovery i.e. the old index must be kept intact until the
+ * new index is successfully written. The old-idx RB-tree is used for the
+ * in-the-gaps method of writing index nodes and is destroyed every commit.
+ */
+void destroy_old_idx(struct ubifs_info *c)
+{
+ struct rb_node *this = c->old_idx.rb_node;
+ struct ubifs_old_idx *old_idx;
+
+ while (this) {
+ if (this->rb_left) {
+ this = this->rb_left;
+ continue;
+ } else if (this->rb_right) {
+ this = this->rb_right;
+ continue;
+ }
+ old_idx = rb_entry(this, struct ubifs_old_idx, rb);
+ this = rb_parent(this);
+ if (this) {
+ if (this->rb_left == &old_idx->rb)
+ this->rb_left = NULL;
+ else
+ this->rb_right = NULL;
+ }
+ kfree(old_idx);
+ }
+ c->old_idx = RB_ROOT;
+}
+
+/**
+ * copy_znode - copy a dirty znode.
+ * @c: UBIFS file-system description object
+ * @znode: znode to copy
+ *
+ * A dirty znode being committed may not be changed, so it is copied.
+ */
+static struct ubifs_znode *copy_znode(struct ubifs_info *c,
+ struct ubifs_znode *znode)
+{
+ struct ubifs_znode *zn;
+
+ zn = kmalloc(c->max_znode_sz, GFP_NOFS);
+ if (unlikely(!zn))
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(zn, znode, c->max_znode_sz);
+ zn->cnext = NULL;
+ __set_bit(DIRTY_ZNODE, &zn->flags);
+ __clear_bit(COW_ZNODE, &zn->flags);
+
+ ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags));
+ __set_bit(OBSOLETE_ZNODE, &znode->flags);
+
+ if (znode->level != 0) {
+ int i;
+ const int n = zn->child_cnt;
+
+ /* The children now have new parent */
+ for (i = 0; i < n; i++) {
+ struct ubifs_zbranch *zbr = &zn->zbranch[i];
+
+ if (zbr->znode)
+ zbr->znode->parent = zn;
+ }
+ }
+
+ atomic_long_inc(&c->dirty_zn_cnt);
+ return zn;
+}
+
+/**
+ * add_idx_dirt - add dirt due to a dirty znode.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of index node
+ * @dirt: size of index node
+ *
+ * This function updates lprops dirty space and the new size of the index.
+ */
+static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt)
+{
+ c->calc_idx_sz -= ALIGN(dirt, 8);
+ return ubifs_add_dirt(c, lnum, dirt);
+}
+
+/**
+ * dirty_cow_znode - ensure a znode is not being committed.
+ * @c: UBIFS file-system description object
+ * @zbr: branch of znode to check
+ *
+ * Returns dirtied znode on success or negative error code on failure.
+ */
+static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c,
+ struct ubifs_zbranch *zbr)
+{
+ struct ubifs_znode *znode = zbr->znode;
+ struct ubifs_znode *zn;
+ int err;
+
+ if (!test_bit(COW_ZNODE, &znode->flags)) {
+ /* znode is not being committed */
+ if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) {
+ atomic_long_inc(&c->dirty_zn_cnt);
+ atomic_long_dec(&c->clean_zn_cnt);
+ atomic_long_dec(&ubifs_clean_zn_cnt);
+ err = add_idx_dirt(c, zbr->lnum, zbr->len);
+ if (unlikely(err))
+ return ERR_PTR(err);
+ }
+ return znode;
+ }
+
+ zn = copy_znode(c, znode);
+ if (unlikely(IS_ERR(zn)))
+ return zn;
+
+ if (zbr->len) {
+ err = insert_old_idx(c, zbr->lnum, zbr->offs);
+ if (unlikely(err))
+ return ERR_PTR(err);
+ err = add_idx_dirt(c, zbr->lnum, zbr->len);
+ } else
+ err = 0;
+
+ zbr->znode = zn;
+ zbr->lnum = 0;
+ zbr->offs = 0;
+ zbr->len = 0;
+
+ if (unlikely(err))
+ return ERR_PTR(err);
+ return zn;
+}
+
+/**
+ * lnc_add - add a leaf node to the leaf node cache.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of leaf node
+ * @node: leaf node
+ *
+ * Leaf nodes are non-index nodes directory entry nodes or data nodes. The
+ * purpose of the leaf node cache is to save re-reading the same leaf node over
+ * and over again. Most things are cached by VFS, however the file system must
+ * cache directory entries for readdir and for resolving hash collisions. The
+ * present implementation of the leaf node cache is extremely simple, and
+ * allows for error returns that are not used but that may be needed if a more
+ * complex implementation is created.
+ *
+ * Note, this function does not add the @node object to LNC directly, but
+ * allocates a copy of the object and adds the copy to LNC. The reason for this
+ * is that @node has been allocated outside of the TNC subsystem and will be
+ * used with @c->tnc_mutex unlock upon return from the TNC subsystem. But LNC
+ * may be changed at any time, e.g. freed by the shrinker.
+ */
+static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+ const void *node)
+{
+ int err;
+ void *lnc_node;
+ const struct ubifs_dent_node *dent = node;
+
+ ubifs_assert(!zbr->leaf);
+ ubifs_assert(zbr->len != 0);
+ ubifs_assert(is_hash_key(c, &zbr->key));
+
+ err = ubifs_validate_entry(c, dent);
+ if (err) {
+ dbg_dump_stack();
+ dbg_dump_node(c, dent);
+ return err;
+ }
+
+ lnc_node = kmalloc(zbr->len, GFP_NOFS);
+ if (!lnc_node)
+ /* We don't have to have the cache, so no error */
+ return 0;
+
+ memcpy(lnc_node, node, zbr->len);
+ zbr->leaf = lnc_node;
+ return 0;
+}
+
+ /**
+ * lnc_add_directly - add a leaf node to the leaf-node-cache.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of leaf node
+ * @node: leaf node
+ *
+ * This function is similar to 'lnc_add()', but it does not create a copy of
+ * @node but inserts @node to TNC directly.
+ */
+static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+ void *node)
+{
+ int err;
+
+ ubifs_assert(!zbr->leaf);
+ ubifs_assert(zbr->len != 0);
+
+ err = ubifs_validate_entry(c, node);
+ if (err) {
+ dbg_dump_stack();
+ dbg_dump_node(c, node);
+ return err;
+ }
+
+ zbr->leaf = node;
+ return 0;
+}
+
+/**
+ * lnc_free - remove a leaf node from the leaf node cache.
+ * @zbr: zbranch of leaf node
+ * @node: leaf node
+ */
+static void lnc_free(struct ubifs_zbranch *zbr)
+{
+ if (!zbr->leaf)
+ return;
+ kfree(zbr->leaf);
+ zbr->leaf = NULL;
+}
+
+/**
+ * tnc_read_node_nm - read a "hashed" leaf node.
+ * @c: UBIFS file-system description object
+ * @zbr: key and position of the node
+ * @node: node is returned here
+ *
+ * This function reads a "hashed" node defined by @zbr from the leaf node cache
+ * (in it is there) or from the hash media, in which case the node is also
+ * added to LNC. Returns zero in case of success or a negative negative error
+ * code in case of failure.
+ */
+static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+ void *node)
+{
+ int err;
+
+ ubifs_assert(is_hash_key(c, &zbr->key));
+
+ if (zbr->leaf) {
+ /* Read from the leaf node cache */
+ ubifs_assert(zbr->len != 0);
+ memcpy(node, zbr->leaf, zbr->len);
+ return 0;
+ }
+
+ err = ubifs_tnc_read_node(c, zbr, node);
+ if (err)
+ return err;
+
+ /* Add the node to the leaf node cache */
+ err = lnc_add(c, zbr, node);
+ return err;
+}
+
+/**
+ * try_read_node - read a node if it is a node.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to read to
+ * @type: node type
+ * @len: node length (not aligned)
+ * @lnum: LEB number of node to read
+ * @offs: offset of node to read
+ *
+ * This function tries to read a node of known type and length, checks it and
+ * stores it in @buf. This function returns %1 if a node is present and %0 if
+ * a node is not present. A negative error code is returned for I/O errors.
+ * This function performs that same function as ubifs_read_node except that
+ * it does not require that there is actually a node present and instead
+ * the return code indicates if a node was read.
+ */
+static int try_read_node(const struct ubifs_info *c, void *buf, int type,
+ int len, int lnum, int offs)
+{
+ int err, node_len;
+ struct ubifs_ch *ch = buf;
+ uint32_t crc, node_crc;
+
+ dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len);
+
+ err = ubi_read(c->ubi, lnum, buf, offs, len);
+ if (err) {
+ ubifs_err("cannot read node type %d from LEB %d:%d, error %d",
+ type, lnum, offs, err);
+ return err;
+ }
+
+ if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC)
+ return 0;
+
+ if (ch->node_type != type)
+ return 0;
+
+ node_len = le32_to_cpu(ch->len);
+ if (node_len != len)
+ return 0;
+
+ crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
+ node_crc = le32_to_cpu(ch->crc);
+ if (crc != node_crc)
+ return 0;
+
+ return 1;
+}
+
+/**
+ * fallible_read_node - try to read a leaf node.
+ * @c: UBIFS file-system description object
+ * @key: key of node to read
+ * @zbr: position of node
+ * @node: node returned
+ *
+ * This function tries to read a node and returns %1 if the node is read, %0
+ * if the node is not present, and a negative error code in the case of error.
+ */
+static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
+ struct ubifs_zbranch *zbr, void *node)
+{
+ int ret;
+
+ dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key));
+
+ ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
+ zbr->offs);
+ if (ret == 1) {
+ union ubifs_key node_key;
+ struct ubifs_dent_node *dent = node;
+
+ /* All nodes have key in the same place */
+ key_read(c, &dent->key, &node_key);
+ if (keys_cmp(c, key, &node_key) != 0)
+ ret = 0;
+ }
+ if (ret == 0 && c->replaying)
+ dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
+ zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
+ return ret;
+}
+
+/**
+ * matches_name - determine if a direntry or xattr entry matches a given name.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of dent
+ * @nm: name to match
+ *
+ * This function checks if xentry/direntry referred by zbranch @zbr matches name
+ * @nm. Returns %NAME_MATCHES if it does, %NAME_LESS if the name referred by
+ * @zbr is less than @nm, and %NAME_GREATER if it is greater than @nm. In case
+ * of failure, a negative error code is returned.
+ */
+static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+ const struct qstr *nm)
+{
+ struct ubifs_dent_node *dent;
+ int nlen, err;
+
+ /* If possible, match against the dent in the leaf node cache */
+ if (!zbr->leaf) {
+ dent = kmalloc(zbr->len, GFP_NOFS);
+ if (!dent)
+ return -ENOMEM;
+
+ err = ubifs_tnc_read_node(c, zbr, dent);
+ if (err)
+ goto out_free;
+
+ /* Add the node to the leaf node cache */
+ err = lnc_add_directly(c, zbr, dent);
+ if (err)
+ goto out_free;
+ } else
+ dent = zbr->leaf;
+
+ nlen = le16_to_cpu(dent->nlen);
+ err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
+ if (err == 0) {
+ if (nlen == nm->len)
+ return NAME_MATCHES;
+ else if (nlen < nm->len)
+ return NAME_LESS;
+ else
+ return NAME_GREATER;
+ } else if (err < 0)
+ return NAME_LESS;
+ else
+ return NAME_GREATER;
+
+out_free:
+ kfree(dent);
+ return err;
+}
+
+/**
+ * get_znode - get a TNC znode that may not be loaded yet.
+ * @c: UBIFS file-system description object
+ * @znode: parent znode
+ * @n: znode branch slot number
+ *
+ * This function returns the znode or a negative error code.
+ */
+static struct ubifs_znode *get_znode(struct ubifs_info *c,
+ struct ubifs_znode *znode, int n)
+{
+ struct ubifs_zbranch *zbr;
+
+ zbr = &znode->zbranch[n];
+ if (zbr->znode)
+ znode = zbr->znode;
+ else
+ znode = ubifs_load_znode(c, zbr, znode, n);
+ return znode;
+}
+
+/**
+ * tnc_next - find next TNC entry.
+ * @c: UBIFS file-system description object
+ * @zn: znode is passed and returned here
+ * @n: znode branch slot number is passed and returned here
+ *
+ * This function returns %0 if the next TNC entry is found, %-ENOENT if there is
+ * no next entry, or a negative error code otherwise.
+ */
+static int tnc_next(struct ubifs_info *c, struct ubifs_znode **zn, int *n)
+{
+ struct ubifs_znode *znode = *zn;
+ int nn = *n;
+
+ nn += 1;
+ if (nn < znode->child_cnt) {
+ *n = nn;
+ return 0;
+ }
+ while (1) {
+ struct ubifs_znode *zp;
+
+ zp = znode->parent;
+ if (!zp)
+ return -ENOENT;
+ nn = znode->iip + 1;
+ znode = zp;
+ if (nn < znode->child_cnt) {
+ znode = get_znode(c, znode, nn);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ while (znode->level != 0) {
+ znode = get_znode(c, znode, 0);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ }
+ nn = 0;
+ break;
+ }
+ }
+ *zn = znode;
+ *n = nn;
+ return 0;
+}
+
+/**
+ * tnc_prev - find previous TNC entry.
+ * @c: UBIFS file-system description object
+ * @zn: znode is returned here
+ * @n: znode branch slot number is passed and returned here
+ *
+ * This function returns %0 if the previous TNC entry is found, %-ENOENT if
+ * there is no next entry, or a negative error code otherwise.
+ */
+static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n)
+{
+ struct ubifs_znode *znode = *zn;
+ int nn = *n;
+
+ if (nn > 0) {
+ *n = nn - 1;
+ return 0;
+ }
+ while (1) {
+ struct ubifs_znode *zp;
+
+ zp = znode->parent;
+ if (!zp)
+ return -ENOENT;
+ nn = znode->iip - 1;
+ znode = zp;
+ if (nn >= 0) {
+ znode = get_znode(c, znode, nn);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ while (znode->level != 0) {
+ nn = znode->child_cnt - 1;
+ znode = get_znode(c, znode, nn);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ }
+ nn = znode->child_cnt - 1;
+ break;
+ }
+ }
+ *zn = znode;
+ *n = nn;
+ return 0;
+}
+
+/**
+ * resolve_collision - resolve a collision.
+ * @c: UBIFS file-system description object
+ * @key: key of a directory or extended attribute entry
+ * @zn: znode is returned here
+ * @n: zbranch number is passed and returned here
+ * @nm: name of the entry
+ *
+ * This function is called for "hashed" keys to make sure that the found key
+ * really corresponds to the looked up node (directory or extended attribute
+ * entry). It returns %1 and sets @zn and @n if the collision is resolved.
+ * %0 is returned if @nm is not found and @zn and @n are set to the previous
+ * entry, i.e. to the entry after which @nm could follow if it were in TNC.
+ * This means that @n may be set to %-1 if the leftmost key in @zn is the
+ * previous one. A negative error code is returned on failures.
+ */
+static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key,
+ struct ubifs_znode **zn, int *n,
+ const struct qstr *nm)
+{
+ int err;
+
+ err = matches_name(c, &(*zn)->zbranch[*n], nm);
+ if (unlikely(err < 0))
+ return err;
+ if (err == NAME_MATCHES)
+ return 1;
+
+ if (err == NAME_GREATER) {
+ /* Look left */
+ while (1) {
+ err = tnc_prev(c, zn, n);
+ if (err == -ENOENT) {
+ ubifs_assert(*n == 0);
+ *n = -1;
+ return 0;
+ }
+ if (err < 0)
+ return err;
+ if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) {
+ /*
+ * We have found the branch after which we would
+ * like to insert, but inserting in this znode
+ * may still be wrong. Consider the following 3
+ * znodes, in the case where we are resolving a
+ * collision with Key2.
+ *
+ * znode zp
+ * ----------------------
+ * level 1 | Key0 | Key1 |
+ * -----------------------
+ * | |
+ * znode za | | znode zb
+ * ------------ ------------
+ * level 0 | Key0 | | Key2 |
+ * ------------ ------------
+ *
+ * The lookup finds Key2 in znode zb. Lets say
+ * there is no match and the name is greater so
+ * we look left. When we find Key0, we end up
+ * here. If we return now, we will insert into
+ * znode za at slot n = 1. But that is invalid
+ * according to the parent's keys. Key2 must
+ * be inserted into znode zb.
+ *
+ * Note, this problem is not relevant for the
+ * case when we go right, because
+ * 'tnc_insert()' would correct the parent key.
+ */
+ if (*n == (*zn)->child_cnt - 1) {
+ err = tnc_next(c, zn, n);
+ if (err) {
+ /* Should be impossible */
+ ubifs_assert(0);
+ if (err == -ENOENT)
+ err = -EINVAL;
+ return err;
+ }
+ ubifs_assert(*n == 0);
+ *n = -1;
+ }
+ return 0;
+ }
+ err = matches_name(c, &(*zn)->zbranch[*n], nm);
+ if (err < 0)
+ return err;
+ if (err == NAME_LESS)
+ return 0;
+ if (err == NAME_MATCHES)
+ return 1;
+ ubifs_assert(err == NAME_GREATER);
+ }
+ } else {
+ int nn = *n;
+ struct ubifs_znode *znode = *zn;
+
+ /* Look right */
+ while (1) {
+ err = tnc_next(c, &znode, &nn);
+ if (err == -ENOENT)
+ return 0;
+ if (err < 0)
+ return err;
+ if (keys_cmp(c, &znode->zbranch[nn].key, key))
+ return 0;
+ err = matches_name(c, &znode->zbranch[nn], nm);
+ if (err < 0)
+ return err;
+ if (err == NAME_GREATER)
+ return 0;
+ *zn = znode;
+ *n = nn;
+ if (err == NAME_MATCHES)
+ return 1;
+ ubifs_assert(err == NAME_LESS);
+ }
+ }
+}
+
+/**
+ * fallible_matches_name - determine if a dent matches a given name.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch of dent
+ * @nm: name to match
+ *
+ * This is a "fallible" version of 'matches_name()' function which does not
+ * panic if the direntry/xentry referred by @zbr does not exist on the media.
+ *
+ * This function checks if xentry/direntry referred by zbranch @zbr matches name
+ * @nm. Returns %NAME_MATCHES it does, %NAME_LESS if the name referred by @zbr
+ * is less than @nm, %NAME_GREATER if it is greater than @nm, and @NOT_ON_MEDIA
+ * if xentry/direntry referred by @zbr does not exist on the media. A negative
+ * error code is returned in case of failure.
+ */
+static int fallible_matches_name(struct ubifs_info *c,
+ struct ubifs_zbranch *zbr,
+ const struct qstr *nm)
+{
+ struct ubifs_dent_node *dent;
+ int nlen, err;
+
+ /* If possible, match against the dent in the leaf node cache */
+ if (!zbr->leaf) {
+ dent = kmalloc(zbr->len, GFP_NOFS);
+ if (!dent)
+ return -ENOMEM;
+
+ err = fallible_read_node(c, &zbr->key, zbr, dent);
+ if (err < 0)
+ goto out_free;
+ if (err == 0) {
+ /* The node was not present */
+ err = NOT_ON_MEDIA;
+ goto out_free;
+ }
+ ubifs_assert(err == 1);
+
+ err = lnc_add_directly(c, zbr, dent);
+ if (err)
+ goto out_free;
+ } else
+ dent = zbr->leaf;
+
+ nlen = le16_to_cpu(dent->nlen);
+ err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len));
+ if (err == 0) {
+ if (nlen == nm->len)
+ return NAME_MATCHES;
+ else if (nlen < nm->len)
+ return NAME_LESS;
+ else
+ return NAME_GREATER;
+ } else if (err < 0)
+ return NAME_LESS;
+ else
+ return NAME_GREATER;
+
+out_free:
+ kfree(dent);
+ return err;
+}
+
+/**
+ * fallible_resolve_collision - resolve a collision even if nodes are missing.
+ * @c: UBIFS file-system description object
+ * @key: key
+ * @zn: znode is returned here
+ * @n: branch number is passed and returned here
+ * @nm: name of directory entry
+ * @adding: indicates caller is adding a key to the TNC
+ *
+ * This is a "fallible" version of the 'resolve_collision()' function which
+ * does not panic if one of the nodes referred to by TNC does not exist on the
+ * media. This may happen when replaying the journal if a deleted node was
+ * Garbage-collected and the commit was not done. A branch that refers to a node
+ * that is not present is called a dangling branch. The following are the return
+ * codes for this function:
+ * o if @nm was found, %1 is returned and @zn and @n are set to the found
+ * branch;
+ * o if we are @adding and @nm was not found, %0 is returned;
+ * o if we are not @adding and @nm was not found, but a dangling branch was
+ * found, then %1 is returned and @zn and @n are set to the dangling branch;
+ * o a negative error code is returned in case of failure.
+ */
+static int fallible_resolve_collision(struct ubifs_info *c,
+ const union ubifs_key *key,
+ struct ubifs_znode **zn, int *n,
+ const struct qstr *nm, int adding)
+{
+ struct ubifs_znode *o_znode = NULL, *znode = *zn;
+ int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n;
+
+ cmp = fallible_matches_name(c, &znode->zbranch[nn], nm);
+ if (unlikely(cmp < 0))
+ return cmp;
+ if (cmp == NAME_MATCHES)
+ return 1;
+ if (cmp == NOT_ON_MEDIA) {
+ o_znode = znode;
+ o_n = nn;
+ /*
+ * We are unlucky and hit a dangling branch straight away.
+ * Now we do not really know where to go to find the needed
+ * branch - to the left or to the right. Well, let's try left.
+ */
+ unsure = 1;
+ } else if (!adding)
+ unsure = 1; /* Remove a dangling branch wherever it is */
+
+ if (cmp == NAME_GREATER || unsure) {
+ /* Look left */
+ while (1) {
+ err = tnc_prev(c, zn, n);
+ if (err == -ENOENT) {
+ ubifs_assert(*n == 0);
+ *n = -1;
+ break;
+ }
+ if (err < 0)
+ return err;
+ if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) {
+ /* See comments in 'resolve_collision()' */
+ if (*n == (*zn)->child_cnt - 1) {
+ err = tnc_next(c, zn, n);
+ if (err) {
+ /* Should be impossible */
+ ubifs_assert(0);
+ if (err == -ENOENT)
+ err = -EINVAL;
+ return err;
+ }
+ ubifs_assert(*n == 0);
+ *n = -1;
+ }
+ break;
+ }
+ err = fallible_matches_name(c, &(*zn)->zbranch[*n], nm);
+ if (err < 0)
+ return err;
+ if (err == NAME_MATCHES)
+ return 1;
+ if (err == NOT_ON_MEDIA) {
+ o_znode = *zn;
+ o_n = *n;
+ continue;
+ }
+ if (!adding)
+ continue;
+ if (err == NAME_LESS)
+ break;
+ else
+ unsure = 0;
+ }
+ }
+
+ if (cmp == NAME_LESS || unsure) {
+ /* Look right */
+ *zn = znode;
+ *n = nn;
+ while (1) {
+ err = tnc_next(c, &znode, &nn);
+ if (err == -ENOENT)
+ break;
+ if (err < 0)
+ return err;
+ if (keys_cmp(c, &znode->zbranch[nn].key, key))
+ break;
+ err = fallible_matches_name(c, &znode->zbranch[nn], nm);
+ if (err < 0)
+ return err;
+ if (err == NAME_GREATER)
+ break;
+ *zn = znode;
+ *n = nn;
+ if (err == NAME_MATCHES)
+ return 1;
+ if (err == NOT_ON_MEDIA) {
+ o_znode = znode;
+ o_n = nn;
+ }
+ }
+ }
+
+ /* Never match a dangling branch when adding */
+ if (adding || !o_znode)
+ return 0;
+
+ dbg_mnt("dangling match LEB %d:%d len %d %s",
+ o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs,
+ o_znode->zbranch[o_n].len, DBGKEY(key));
+ *zn = o_znode;
+ *n = o_n;
+ return 1;
+}
+
+/**
+ * matches_position - determine if a zbranch matches a given position.
+ * @zbr: zbranch of dent
+ * @lnum: LEB number of dent to match
+ * @offs: offset of dent to match
+ *
+ * This function returns %1 if @lnum:@offs matches, and %0 otherwise.
+ */
+static int matches_position(struct ubifs_zbranch *zbr, int lnum, int offs)
+{
+ if (zbr->lnum == lnum && zbr->offs == offs)
+ return 1;
+ else
+ return 0;
+}
+
+/**
+ * resolve_collision_directly - resolve a collision directly.
+ * @c: UBIFS file-system description object
+ * @key: key of directory entry
+ * @zn: znode is passed and returned here
+ * @n: zbranch number is passed and returned here
+ * @lnum: LEB number of dent node to match
+ * @offs: offset of dent node to match
+ *
+ * This function is used for "hashed" keys to make sure the found directory or
+ * extended attribute entry node is what was looked for. It is used when the
+ * flash address of the right node is known (@lnum:@offs) which makes it much
+ * easier to resolve collisions (no need to read entries and match full
+ * names). This function returns %1 and sets @zn and @n if the collision is
+ * resolved, %0 if @lnum:@offs is not found and @zn and @n are set to the
+ * previous directory entry. Otherwise a negative error code is returned.
+ */
+static int resolve_collision_directly(struct ubifs_info *c,
+ const union ubifs_key *key,
+ struct ubifs_znode **zn, int *n,
+ int lnum, int offs)
+{
+ struct ubifs_znode *znode;
+ int nn, err;
+
+ znode = *zn;
+ nn = *n;
+ if (matches_position(&znode->zbranch[nn], lnum, offs))
+ return 1;
+
+ /* Look left */
+ while (1) {
+ err = tnc_prev(c, &znode, &nn);
+ if (err == -ENOENT)
+ break;
+ if (err < 0)
+ return err;
+ if (keys_cmp(c, &znode->zbranch[nn].key, key))
+ break;
+ if (matches_position(&znode->zbranch[nn], lnum, offs)) {
+ *zn = znode;
+ *n = nn;
+ return 1;
+ }
+ }
+
+ /* Look right */
+ znode = *zn;
+ nn = *n;
+ while (1) {
+ err = tnc_next(c, &znode, &nn);
+ if (err == -ENOENT)
+ return 0;
+ if (err < 0)
+ return err;
+ if (keys_cmp(c, &znode->zbranch[nn].key, key))
+ return 0;
+ *zn = znode;
+ *n = nn;
+ if (matches_position(&znode->zbranch[nn], lnum, offs))
+ return 1;
+ }
+}
+
+/**
+ * dirty_cow_bottom_up - dirty a znode and its ancestors.
+ * @c: UBIFS file-system description object
+ * @znode: znode to dirty
+ *
+ * If we do not have a unique key that resides in a znode, then we cannot
+ * dirty that znode from the top down (i.e. by using lookup_level0_dirty)
+ * This function records the path back to the last dirty ancestor, and then
+ * dirties the znodes on that path.
+ */
+static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c,
+ struct ubifs_znode *znode)
+{
+ struct ubifs_znode *zp;
+ int *path = c->bottom_up_buf, p = 0;
+
+ ubifs_assert(c->zroot.znode);
+ ubifs_assert(znode);
+ if (c->zroot.znode->level > BOTTOM_UP_HEIGHT) {
+ kfree(c->bottom_up_buf);
+ c->bottom_up_buf = kmalloc(c->zroot.znode->level * sizeof(int),
+ GFP_NOFS);
+ if (!c->bottom_up_buf)
+ return ERR_PTR(-ENOMEM);
+ path = c->bottom_up_buf;
+ }
+ if (c->zroot.znode->level) {
+ /* Go up until parent is dirty */
+ while (1) {
+ int n;
+
+ zp = znode->parent;
+ if (!zp)
+ break;
+ n = znode->iip;
+ ubifs_assert(p < c->zroot.znode->level);
+ path[p++] = n;
+ if (!zp->cnext && ubifs_zn_dirty(znode))
+ break;
+ znode = zp;
+ }
+ }
+
+ /* Come back down, dirtying as we go */
+ while (1) {
+ struct ubifs_zbranch *zbr;
+
+ zp = znode->parent;
+ if (zp) {
+ ubifs_assert(path[p - 1] >= 0);
+ ubifs_assert(path[p - 1] < zp->child_cnt);
+ zbr = &zp->zbranch[path[--p]];
+ znode = dirty_cow_znode(c, zbr);
+ } else {
+ ubifs_assert(znode == c->zroot.znode);
+ znode = dirty_cow_znode(c, &c->zroot);
+ }
+ if (unlikely(IS_ERR(znode)) || !p)
+ break;
+ ubifs_assert(path[p - 1] >= 0);
+ ubifs_assert(path[p - 1] < znode->child_cnt);
+ znode = znode->zbranch[path[p - 1]].znode;
+ }
+
+ return znode;
+}
+
+/**
+ * ubifs_lookup_level0 - search for zero-level znode.
+ * @c: UBIFS file-system description object
+ * @key: key to lookup
+ * @zn: znode is returned here
+ * @n: znode branch slot number is returned here
+ *
+ * This function looks up the TNC tree and search for zero-level znode which
+ * refers key @key. The found zero-level znode is returned in @zn. There are 3
+ * cases:
+ * o exact match, i.e. the found zero-level znode contains key @key, then %1
+ * is returned and slot number of the matched branch is stored in @n;
+ * o not exact match, which means that zero-level znode does not contain
+ * @key, then %0 is returned and slot number of the closed branch is stored
+ * in @n;
+ * o @key is so small that it is even less than the lowest key of the
+ * leftmost zero-level node, then %0 is returned and %0 is stored in @n.
+ *
+ * Note, when the TNC tree is traversed, some znodes may be absent, then this
+ * function reads corresponding indexing nodes and inserts them to TNC. In
+ * case of failure, a negative error code is returned.
+ */
+int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
+ struct ubifs_znode **zn, int *n)
+{
+ int err, exact;
+ struct ubifs_znode *znode;
+ unsigned long time = get_seconds();
+
+ dbg_tnc("search key %s", DBGKEY(key));
+
+ znode = c->zroot.znode;
+ if (unlikely(!znode)) {
+ znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ }
+
+ znode->time = time;
+
+ while (1) {
+ struct ubifs_zbranch *zbr;
+
+ exact = ubifs_search_zbranch(c, znode, key, n);
+
+ if (znode->level == 0)
+ break;
+
+ if (*n < 0)
+ *n = 0;
+ zbr = &znode->zbranch[*n];
+
+ if (zbr->znode) {
+ znode->time = time;
+ znode = zbr->znode;
+ continue;
+ }
+
+ /* znode is not in TNC cache, load it from the media */
+ znode = ubifs_load_znode(c, zbr, znode, *n);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ }
+
+ *zn = znode;
+ if (exact || !is_hash_key(c, key) || *n != -1) {
+ dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n);
+ return exact;
+ }
+
+ /*
+ * Here is a tricky place. We have not found the key and this is a
+ * "hashed" key, which may collide. The rest of the code deals with
+ * situations like this:
+ *
+ * | 3 | 5 |
+ * / \
+ * | 3 | 5 | | 6 | 7 | (x)
+ *
+ * Or more a complex example:
+ *
+ * | 1 | 5 |
+ * / \
+ * | 1 | 3 | | 5 | 8 |
+ * \ /
+ * | 5 | 5 | | 6 | 7 | (x)
+ *
+ * In the examples, if we are looking for key "5", we may reach nodes
+ * marked with "(x)". In this case what we have do is to look at the
+ * left and see if there is "5" key there. If there is, we have to
+ * return it.
+ *
+ * Note, this whole situation is possible because we allow to have
+ * elements which are equivalent to the next key in the parent in the
+ * children of current znode. For example, this happens if we split a
+ * znode like this: | 3 | 5 | 5 | 6 | 7 |, which results in something
+ * like this:
+ * | 3 | 5 |
+ * / \
+ * | 3 | 5 | | 5 | 6 | 7 |
+ * ^
+ * And this becomes what is at the first "picture" after key "5" marked
+ * with "^" is removed. What could be done is we could prohibit
+ * splitting in the middle of the colliding sequence. Also, when
+ * removing the leftmost key, we would have to correct the key of the
+ * parent node, which would introduce additional complications. Namely,
+ * if we changed the the leftmost key of the parent znode, the garbage
+ * collector would be unable to find it (GC is doing this when GC'ing
+ * indexing LEBs). Although we already have an additional RB-tree where
+ * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
+ * after the commit. But anyway, this does not look easy to implement
+ * so we did not try this.
+ */
+ err = tnc_prev(c, &znode, n);
+ if (err == -ENOENT) {
+ dbg_tnc("found 0, lvl %d, n -1", znode->level);
+ *n = -1;
+ return 0;
+ }
+ if (unlikely(err < 0))
+ return err;
+ if (keys_cmp(c, key, &znode->zbranch[*n].key)) {
+ dbg_tnc("found 0, lvl %d, n -1", znode->level);
+ *n = -1;
+ return 0;
+ }
+
+ dbg_tnc("found 1, lvl %d, n %d", znode->level, *n);
+ *zn = znode;
+ return 1;
+}
+
+/**
+ * lookup_level0_dirty - search for zero-level znode dirtying.
+ * @c: UBIFS file-system description object
+ * @key: key to lookup
+ * @zn: znode is returned here
+ * @n: znode branch slot number is returned here
+ *
+ * This function looks up the TNC tree and search for zero-level znode which
+ * refers key @key. The found zero-level znode is returned in @zn. There are 3
+ * cases:
+ * o exact match, i.e. the found zero-level znode contains key @key, then %1
+ * is returned and slot number of the matched branch is stored in @n;
+ * o not exact match, which means that zero-level znode does not contain @key
+ * then %0 is returned and slot number of the closed branch is stored in
+ * @n;
+ * o @key is so small that it is even less than the lowest key of the
+ * leftmost zero-level node, then %0 is returned and %-1 is stored in @n.
+ *
+ * Additionally all znodes in the path from the root to the located zero-level
+ * znode are marked as dirty.
+ *
+ * Note, when the TNC tree is traversed, some znodes may be absent, then this
+ * function reads corresponding indexing nodes and inserts them to TNC. In
+ * case of failure, a negative error code is returned.
+ */
+static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
+ struct ubifs_znode **zn, int *n)
+{
+ int err, exact;
+ struct ubifs_znode *znode;
+ unsigned long time = get_seconds();
+
+ dbg_tnc("search and dirty key %s", DBGKEY(key));
+
+ znode = c->zroot.znode;
+ if (unlikely(!znode)) {
+ znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ }
+
+ znode = dirty_cow_znode(c, &c->zroot);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+
+ znode->time = time;
+
+ while (1) {
+ struct ubifs_zbranch *zbr;
+
+ exact = ubifs_search_zbranch(c, znode, key, n);
+
+ if (znode->level == 0)
+ break;
+
+ if (*n < 0)
+ *n = 0;
+ zbr = &znode->zbranch[*n];
+
+ if (zbr->znode) {
+ znode->time = time;
+ znode = dirty_cow_znode(c, zbr);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ continue;
+ }
+
+ /* znode is not in TNC cache, load it from the media */
+ znode = ubifs_load_znode(c, zbr, znode, *n);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ znode = dirty_cow_znode(c, zbr);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ }
+
+ *zn = znode;
+ if (exact || !is_hash_key(c, key) || *n != -1) {
+ dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n);
+ return exact;
+ }
+
+ /*
+ * See huge comment at 'lookup_level0_dirty()' what is the rest of the
+ * code.
+ */
+ err = tnc_prev(c, &znode, n);
+ if (err == -ENOENT) {
+ *n = -1;
+ dbg_tnc("found 0, lvl %d, n -1", znode->level);
+ return 0;
+ }
+ if (unlikely(err < 0))
+ return err;
+ if (keys_cmp(c, key, &znode->zbranch[*n].key)) {
+ *n = -1;
+ dbg_tnc("found 0, lvl %d, n -1", znode->level);
+ return 0;
+ }
+
+ if (znode->cnext || !ubifs_zn_dirty(znode)) {
+ znode = dirty_cow_bottom_up(c, znode);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ }
+
+ dbg_tnc("found 1, lvl %d, n %d", znode->level, *n);
+ *zn = znode;
+ return 1;
+}
+
+/**
+ * maybe_leb_gced - determine if a LEB may have been garbage collected.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @gc_seq1: garbage collection sequence number
+ *
+ * This function determines if @lnum may have been garbage collected since
+ * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise
+ * %0 is returned.
+ */
+static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1)
+{
+ int gc_seq2, gced_lnum;
+
+ gced_lnum = c->gced_lnum;
+ smp_rmb();
+ gc_seq2 = c->gc_seq;
+ /* Same seq means no GC */
+ if (gc_seq1 == gc_seq2)
+ return 0;
+ /* Different by more than 1 means we don't know */
+ if (gc_seq1 + 1 != gc_seq2)
+ return 1;
+ /*
+ * We have seen the sequence number has increased by 1. Now we need to
+ * be sure we read the right LEB number, so read it again.
+ */
+ smp_rmb();
+ if (gced_lnum != c->gced_lnum)
+ return 1;
+ /* Finally we can check lnum */
+ if (gced_lnum == lnum)
+ return 1;
+ return 0;
+}
+
+/**
+ * ubifs_tnc_locate - look up a file-system node and return it and its location.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ * @lnum: LEB number is returned here
+ * @offs: offset is returned here
+ *
+ * This function look up and reads node with key @key. The caller has to make
+ * sure the @node buffer is large enough to fit the node. Returns zero in case
+ * of success, %-ENOENT if the node was not found, and a negative error code in
+ * case of failure. The node location can be returned in @lnum and @offs.
+ */
+int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
+ void *node, int *lnum, int *offs)
+{
+ int found, n, err, safely = 0, gc_seq1;
+ struct ubifs_znode *znode;
+ struct ubifs_zbranch zbr, *zt;
+
+again:
+ mutex_lock(&c->tnc_mutex);
+ found = ubifs_lookup_level0(c, key, &znode, &n);
+ if (!found) {
+ err = -ENOENT;
+ goto out;
+ } else if (found < 0) {
+ err = found;
+ goto out;
+ }
+ zt = &znode->zbranch[n];
+ if (lnum) {
+ *lnum = zt->lnum;
+ *offs = zt->offs;
+ }
+ if (is_hash_key(c, key)) {
+ /*
+ * In this case the leaf node cache gets used, so we pass the
+ * address of the zbranch and keep the mutex locked
+ */
+ err = tnc_read_node_nm(c, zt, node);
+ goto out;
+ }
+ if (safely) {
+ err = ubifs_tnc_read_node(c, zt, node);
+ goto out;
+ }
+ /* Drop the TNC mutex prematurely and race with garbage collection */
+ zbr = znode->zbranch[n];
+ gc_seq1 = c->gc_seq;
+ mutex_unlock(&c->tnc_mutex);
+
+ if (ubifs_get_wbuf(c, zbr.lnum)) {
+ /* We do not GC journal heads */
+ err = ubifs_tnc_read_node(c, &zbr, node);
+ return err;
+ }
+
+ err = fallible_read_node(c, key, &zbr, node);
+ if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) {
+ /*
+ * The node may have been GC'ed out from under us so try again
+ * while keeping the TNC mutex locked.
+ */
+ safely = 1;
+ goto again;
+ }
+ return 0;
+
+out:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * do_lookup_nm- look up a "hashed" node.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ * @nm: node name
+ *
+ * This function look up and reads a node which contains name hash in the key.
+ * Since the hash may have collisions, there may be many nodes with the same
+ * key, so we have to sequentially look to all of them until the needed one is
+ * found. This function returns zero in case of success, %-ENOENT if the node
+ * was not found, and a negative error code in case of failure.
+ */
+static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
+ void *node, const struct qstr *nm)
+{
+ int found, n, err;
+ struct ubifs_znode *znode;
+
+ dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
+ mutex_lock(&c->tnc_mutex);
+ found = ubifs_lookup_level0(c, key, &znode, &n);
+ if (!found) {
+ err = -ENOENT;
+ goto out_unlock;
+ } else if (found < 0) {
+ err = found;
+ goto out_unlock;
+ }
+
+ ubifs_assert(n >= 0);
+
+ err = resolve_collision(c, key, &znode, &n, nm);
+ dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n);
+ if (unlikely(err < 0))
+ goto out_unlock;
+ if (err == 0) {
+ err = -ENOENT;
+ goto out_unlock;
+ }
+
+ err = tnc_read_node_nm(c, &znode->zbranch[n], node);
+
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * ubifs_tnc_lookup_nm - look up a "hashed" node.
+ * @c: UBIFS file-system description object
+ * @key: node key to lookup
+ * @node: the node is returned here
+ * @nm: node name
+ *
+ * This function look up and reads a node which contains name hash in the key.
+ * Since the hash may have collisions, there may be many nodes with the same
+ * key, so we have to sequentially look to all of them until the needed one is
+ * found. This function returns zero in case of success, %-ENOENT if the node
+ * was not found, and a negative error code in case of failure.
+ */
+int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
+ void *node, const struct qstr *nm)
+{
+ int err, len;
+ const struct ubifs_dent_node *dent = node;
+
+ /*
+ * We assume that in most of the cases there are no name collisions and
+ * 'ubifs_tnc_lookup()' returns us the right direntry.
+ */
+ err = ubifs_tnc_lookup(c, key, node);
+ if (err)
+ return err;
+
+ len = le16_to_cpu(dent->nlen);
+ if (nm->len == len && !memcmp(dent->name, nm->name, len))
+ return 0;
+
+ /*
+ * Unluckily, there are hash collisions and we have to iterate over
+ * them look at each direntry with colliding name hash sequentially.
+ */
+ return do_lookup_nm(c, key, node, nm);
+}
+
+/**
+ * correct_parent_keys - correct parent znodes' keys.
+ * @c: UBIFS file-system description object
+ * @znode: znode to correct parent znodes for
+ *
+ * This is a helper function for 'tnc_insert()'. When the key of the leftmost
+ * zbranch changes, keys of parent znodes have to be corrected. This helper
+ * function is called in such situations and corrects the keys if needed.
+ */
+static void correct_parent_keys(const struct ubifs_info *c,
+ struct ubifs_znode *znode)
+{
+ union ubifs_key *key, *key1;
+
+ ubifs_assert(znode->parent);
+ ubifs_assert(znode->iip == 0);
+
+ key = &znode->zbranch[0].key;
+ key1 = &znode->parent->zbranch[0].key;
+
+ while (keys_cmp(c, key, key1) < 0) {
+ key_copy(c, key, key1);
+ znode = znode->parent;
+ znode->alt = 1;
+ if (!znode->parent || znode->iip)
+ break;
+ key1 = &znode->parent->zbranch[0].key;
+ }
+}
+
+/**
+ * insert_zbranch - insert a zbranch into a znode.
+ * @znode: znode into which to insert
+ * @zbr: zbranch to insert
+ * @n: slot number to insert to
+ *
+ * This is a helper function for 'tnc_insert()'. UBIFS does not allow "gaps" in
+ * znode's array of zbranches and keeps zbranches consolidated, so when a new
+ * zbranch has to be inserted to the @znode->zbranches[]' array at the @n-th
+ * slot, zbranches starting from @n have to be moved right.
+ */
+static void insert_zbranch(struct ubifs_znode *znode,
+ const struct ubifs_zbranch *zbr, int n)
+{
+ int i;
+
+ ubifs_assert(ubifs_zn_dirty(znode));
+
+ if (znode->level) {
+ for (i = znode->child_cnt; i > n; i--) {
+ znode->zbranch[i] = znode->zbranch[i - 1];
+ if (znode->zbranch[i].znode)
+ znode->zbranch[i].znode->iip = i;
+ }
+ if (zbr->znode)
+ zbr->znode->iip = n;
+ } else
+ for (i = znode->child_cnt; i > n; i--)
+ znode->zbranch[i] = znode->zbranch[i - 1];
+
+ znode->zbranch[n] = *zbr;
+ znode->child_cnt += 1;
+
+ /*
+ * After inserting at slot zero, the lower bound of the key range of
+ * this znode may have changed. If this znode is subsequently split
+ * then the upper bound of the key range may change, and furthermore
+ * it could change to be lower than the original lower bound. If that
+ * happens, then it will no longer be possible to find this znode in the
+ * TNC using the key from the index node on flash. That is bad because
+ * if it is not found, we will assume it is obsolete and may overwrite
+ * it. Then if there is an unclean unmount, we will start using the
+ * old index which will be broken.
+ *
+ * So we first mark znodes that have insertions at slot zero, and then
+ * if they are split we add their lnum/offs to the old_idx tree.
+ */
+ if (n == 0)
+ znode->alt = 1;
+}
+
+/**
+ * tnc_insert - insert a node into TNC.
+ * @c: UBIFS file-system description object
+ * @znode: znode to insert into
+ * @zbr: branch to insert
+ * @n: slot number to insert new zbranch to
+ *
+ * This function inserts a new node described by @zbr into znode @znode. If
+ * znode does not have a free slot for new zbranch, it is split. Parent znodes
+ * are splat as well if needed. Returns zero in case of success or a negative
+ * error code in case of failure.
+ */
+static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode,
+ struct ubifs_zbranch *zbr, int n)
+{
+ struct ubifs_znode *zn, *zi, *zp;
+ int i, keep, move, appending = 0;
+ union ubifs_key *key = &zbr->key;
+
+ ubifs_assert(n >= 0 && n <= c->fanout);
+
+ /* Implement naive insert for now */
+again:
+ zp = znode->parent;
+ if (znode->child_cnt < c->fanout) {
+ ubifs_assert(n != c->fanout);
+ dbg_tnc("inserted at %d level %d, key %s", n, znode->level,
+ DBGKEY(key));
+
+ insert_zbranch(znode, zbr, n);
+
+ /* Ensure parent's key is correct */
+ if (n == 0 && zp && znode->iip == 0)
+ correct_parent_keys(c, znode);
+
+ return 0;
+ }
+
+ /*
+ * Unfortunately, @znode does not have more empty slots and we have to
+ * split it.
+ */
+ dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key));
+
+ if (znode->alt)
+ /*
+ * We can no longer be sure of finding this znode by key, so we
+ * record it in the old_idx tree.
+ */
+ ins_clr_old_idx_znode(c, znode);
+
+ zn = kzalloc(c->max_znode_sz, GFP_NOFS);
+ if (!zn)
+ return -ENOMEM;
+ zn->parent = zp;
+ zn->level = znode->level;
+
+ /* Decide where to split */
+ if (znode->level == 0 && n == c->fanout &&
+ key_type(c, key) == UBIFS_DATA_KEY) {
+ union ubifs_key *key1;
+
+ /*
+ * If this is an inode which is being appended - do not split
+ * it because no other zbranches can be inserted between
+ * zbranches of consecutive data nodes anyway.
+ */
+ key1 = &znode->zbranch[n - 1].key;
+ if (key_inum(c, key1) == key_inum(c, key) &&
+ key_type(c, key1) == UBIFS_DATA_KEY &&
+ key_block(c, key1) == key_block(c, key) - 1)
+ appending = 1;
+ }
+
+ if (appending) {
+ keep = c->fanout;
+ move = 0;
+ } else {
+ keep = (c->fanout + 1) / 2;
+ move = c->fanout - keep;
+ }
+
+ /*
+ * Although we don't at present, we could look at the neighbors and see
+ * if we can move some zbranches there.
+ */
+
+ if (n < keep) {
+ /* Insert into existing znode */
+ zi = znode;
+ move += 1;
+ keep -= 1;
+ } else {
+ /* Insert into new znode */
+ zi = zn;
+ n -= keep;
+ /* Re-parent */
+ if (zn->level != 0)
+ zbr->znode->parent = zn;
+ }
+
+ __set_bit(DIRTY_ZNODE, &zn->flags);
+ atomic_long_inc(&c->dirty_zn_cnt);
+
+ zn->child_cnt = move;
+ znode->child_cnt = keep;
+
+ dbg_tnc("moving %d, keeping %d", move, keep);
+
+ /* Move zbranch */
+ for (i = 0; i < move; i++) {
+ zn->zbranch[i] = znode->zbranch[keep + i];
+ /* Re-parent */
+ if (zn->level != 0)
+ if (zn->zbranch[i].znode) {
+ zn->zbranch[i].znode->parent = zn;
+ zn->zbranch[i].znode->iip = i;
+ }
+ }
+
+ /* Insert new key and branch */
+ dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key));
+
+ insert_zbranch(zi, zbr, n);
+
+ /* Insert new znode (produced by spitting) into the parent */
+ if (zp) {
+ i = n;
+ /* Locate insertion point */
+ n = znode->iip + 1;
+ if (appending && n != c->fanout)
+ appending = 0;
+
+ if (i == 0 && zi == znode && znode->iip == 0)
+ correct_parent_keys(c, znode);
+
+ /* Tail recursion */
+ zbr->key = zn->zbranch[0].key;
+ zbr->znode = zn;
+ zbr->lnum = 0;
+ zbr->offs = 0;
+ zbr->len = 0;
+ znode = zp;
+
+ goto again;
+ }
+
+ /* We have to split root znode */
+ dbg_tnc("creating new zroot at level %d", znode->level + 1);
+
+ zi = kzalloc(c->max_znode_sz, GFP_NOFS);
+ if (!zi)
+ return -ENOMEM;
+
+ zi->child_cnt = 2;
+ zi->level = znode->level + 1;
+
+ __set_bit(DIRTY_ZNODE, &zi->flags);
+ atomic_long_inc(&c->dirty_zn_cnt);
+
+ zi->zbranch[0].key = znode->zbranch[0].key;
+ zi->zbranch[0].znode = znode;
+ zi->zbranch[0].lnum = c->zroot.lnum;
+ zi->zbranch[0].offs = c->zroot.offs;
+ zi->zbranch[0].len = c->zroot.len;
+ zi->zbranch[1].key = zn->zbranch[0].key;
+ zi->zbranch[1].znode = zn;
+
+ c->zroot.lnum = 0;
+ c->zroot.offs = 0;
+ c->zroot.len = 0;
+ c->zroot.znode = zi;
+
+ zn->parent = zi;
+ zn->iip = 1;
+ znode->parent = zi;
+ znode->iip = 0;
+
+ return 0;
+}
+
+/**
+ * ubifs_tnc_add - add a node to TNC.
+ * @c: UBIFS file-system description object
+ * @key: key to add
+ * @lnum: LEB number of node
+ * @offs: node offset
+ * @len: node length
+ *
+ * This function adds a node with key @key to TNC. The node may be new or it may
+ * obsolete some existing one. Returns %0 on success or negative error code on
+ * failure.
+ */
+int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
+ int offs, int len)
+{
+ int found, n, err = 0;
+ struct ubifs_znode *znode;
+
+ mutex_lock(&c->tnc_mutex);
+ dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key));
+ found = lookup_level0_dirty(c, key, &znode, &n);
+ if (!found) {
+ struct ubifs_zbranch zbr;
+
+ zbr.znode = NULL;
+ zbr.lnum = lnum;
+ zbr.offs = offs;
+ zbr.len = len;
+ key_copy(c, key, &zbr.key);
+ err = tnc_insert(c, znode, &zbr, n + 1);
+ } else if (found == 1) {
+ struct ubifs_zbranch *zbr = &znode->zbranch[n];
+
+ lnc_free(zbr);
+ err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
+ zbr->lnum = lnum;
+ zbr->offs = offs;
+ zbr->len = len;
+ } else
+ err = found;
+ if (!err)
+ err = dbg_check_tnc(c, 0);
+ mutex_unlock(&c->tnc_mutex);
+
+ return err;
+}
+
+/**
+ * ubifs_tnc_replace - replace a node in the TNC only if the old node is found.
+ * @c: UBIFS file-system description object
+ * @key: key to add
+ * @old_lnum: LEB number of old node
+ * @old_offs: old node offset
+ * @lnum: LEB number of node
+ * @offs: node offset
+ * @len: node length
+ *
+ * This function replaces a node with key @key in the TNC only if the old node
+ * is found. This function is called by garbage collection when node are moved.
+ * Returns %0 on success or negative error code on failure.
+ */
+int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
+ int old_lnum, int old_offs, int lnum, int offs, int len)
+{
+ int found, n, err = 0;
+ struct ubifs_znode *znode;
+
+ mutex_lock(&c->tnc_mutex);
+ dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum,
+ old_offs, lnum, offs, len, DBGKEY(key));
+ found = lookup_level0_dirty(c, key, &znode, &n);
+ if (found < 0) {
+ err = found;
+ goto out_unlock;
+ }
+
+ if (found == 1) {
+ struct ubifs_zbranch *zbr = &znode->zbranch[n];
+
+ found = 0;
+ if (zbr->lnum == old_lnum && zbr->offs == old_offs) {
+ lnc_free(zbr);
+ err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
+ if (err)
+ goto out_unlock;
+ zbr->lnum = lnum;
+ zbr->offs = offs;
+ zbr->len = len;
+ found = 1;
+ } else if (is_hash_key(c, key)) {
+ found = resolve_collision_directly(c, key, &znode, &n,
+ old_lnum, old_offs);
+ dbg_tnc("rc returned %d, znode %p, n %d, LEB %d:%d",
+ found, znode, n, old_lnum, old_offs);
+ if (found < 0) {
+ err = found;
+ goto out_unlock;
+ }
+
+ if (found) {
+ /* Ensure the znode is dirtied */
+ if (znode->cnext || !ubifs_zn_dirty(znode)) {
+ znode = dirty_cow_bottom_up(c,
+ znode);
+ if (IS_ERR(znode)) {
+ err = PTR_ERR(znode);
+ goto out_unlock;
+ }
+ }
+ zbr = &znode->zbranch[n];
+ lnc_free(zbr);
+ err = ubifs_add_dirt(c, zbr->lnum,
+ zbr->len);
+ if (err)
+ goto out_unlock;
+ zbr->lnum = lnum;
+ zbr->offs = offs;
+ zbr->len = len;
+ }
+ }
+ }
+
+ if (!found)
+ err = ubifs_add_dirt(c, lnum, len);
+
+ if (!err)
+ err = dbg_check_tnc(c, 0);
+
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * ubifs_tnc_add_nm - add a "hashed" node to TNC.
+ * @c: UBIFS file-system description object
+ * @key: key to add
+ * @lnum: LEB number of node
+ * @offs: node offset
+ * @len: node length
+ * @nm: node name
+ *
+ * This is the same as 'ubifs_tnc_add()' but it should be used with keys which
+ * may have collisions, like directory entry keys.
+ */
+int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
+ int lnum, int offs, int len, const struct qstr *nm)
+{
+ int found, n, err = 0;
+ struct ubifs_znode *znode;
+
+ mutex_lock(&c->tnc_mutex);
+ dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name,
+ DBGKEY(key));
+ found = lookup_level0_dirty(c, key, &znode, &n);
+ if (found < 0) {
+ err = found;
+ goto out_unlock;
+ }
+
+ if (found == 1) {
+ if (c->replaying)
+ found = fallible_resolve_collision(c, key, &znode, &n,
+ nm, 1);
+ else
+ found = resolve_collision(c, key, &znode, &n, nm);
+ dbg_tnc("rc returned %d, znode %p, n %d", found, znode, n);
+ if (found < 0) {
+ err = found;
+ goto out_unlock;
+ }
+
+ /* Ensure the znode is dirtied */
+ if (znode->cnext || !ubifs_zn_dirty(znode)) {
+ znode = dirty_cow_bottom_up(c, znode);
+ if (IS_ERR(znode)) {
+ err = PTR_ERR(znode);
+ goto out_unlock;
+ }
+ }
+
+ if (found == 1) {
+ struct ubifs_zbranch *zbr = &znode->zbranch[n];
+
+ lnc_free(zbr);
+ err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
+ zbr->lnum = lnum;
+ zbr->offs = offs;
+ zbr->len = len;
+ goto out_unlock;
+ }
+ }
+
+ if (!found) {
+ struct ubifs_zbranch zbr;
+
+ zbr.znode = NULL;
+ zbr.lnum = lnum;
+ zbr.offs = offs;
+ zbr.len = len;
+ key_copy(c, key, &zbr.key);
+ err = tnc_insert(c, znode, &zbr, n + 1);
+ if (err)
+ goto out_unlock;
+ if (c->replaying) {
+ /*
+ * We did not find it in the index so there may be a
+ * dangling branch still in the index. So we remove it
+ * by passing 'ubifs_tnc_remove_nm()' the same key but
+ * an unmatchable name.
+ */
+ struct qstr noname = { .len = 0, .name = "" };
+
+ err = dbg_check_tnc(c, 0);
+ mutex_unlock(&c->tnc_mutex);
+ if (err)
+ return err;
+ return ubifs_tnc_remove_nm(c, key, &noname);
+ }
+ }
+
+out_unlock:
+ if (!err)
+ err = dbg_check_tnc(c, 0);
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * tnc_delete - delete a znode form TNC.
+ * @c: UBIFS file-system description object
+ * @znode: znode to delete from
+ * @n: zbranch slot number to delete
+ *
+ * This function deletes a leaf node from @n-th slot of @znode. Returns zero in
+ * case of success and a negative error code in case of failure.
+ */
+static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
+{
+ struct ubifs_zbranch *zbr;
+ struct ubifs_znode *zp;
+ int i, err;
+
+ /* Delete without merge for now */
+ ubifs_assert(znode->level == 0);
+ ubifs_assert(n >= 0 && n < c->fanout);
+ dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key));
+
+ zbr = &znode->zbranch[n];
+ lnc_free(zbr);
+
+ err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
+ if (err) {
+ dbg_dump_znode(c, znode);
+ return err;
+ }
+
+ /* We do not "gap" zbranch slots */
+ for (i = n; i < znode->child_cnt - 1; i++)
+ znode->zbranch[i] = znode->zbranch[i + 1];
+ znode->child_cnt -= 1;
+
+ if (znode->child_cnt > 0)
+ return 0;
+
+ /*
+ * This was the last zbranch, we have to delete this znode from the
+ * parent.
+ */
+
+ do {
+ ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags));
+ ubifs_assert(ubifs_zn_dirty(znode));
+
+ zp = znode->parent;
+ n = znode->iip;
+
+ atomic_long_dec(&c->dirty_zn_cnt);
+
+ err = insert_old_idx_znode(c, znode);
+ if (err)
+ return err;
+
+ if (znode->cnext) {
+ __set_bit(OBSOLETE_ZNODE, &znode->flags);
+ atomic_long_inc(&c->clean_zn_cnt);
+ atomic_long_inc(&ubifs_clean_zn_cnt);
+ } else
+ kfree(znode);
+ znode = zp;
+ } while (znode->child_cnt == 1); /* while removing last child */
+
+ /* Remove from znode, entry n - 1 */
+ znode->child_cnt -= 1;
+ ubifs_assert(znode->level != 0);
+ for (i = n; i < znode->child_cnt; i++) {
+ znode->zbranch[i] = znode->zbranch[i + 1];
+ if (znode->zbranch[i].znode)
+ znode->zbranch[i].znode->iip = i;
+ }
+
+ /*
+ * If this is the root and it has only 1 child then
+ * collapse the tree.
+ */
+ if (!znode->parent) {
+ while (znode->child_cnt == 1 && znode->level != 0) {
+ zp = znode;
+ zbr = &znode->zbranch[0];
+ znode = get_znode(c, znode, 0);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ znode = dirty_cow_znode(c, zbr);
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+ znode->parent = NULL;
+ znode->iip = 0;
+ if (c->zroot.len) {
+ err = insert_old_idx(c, c->zroot.lnum,
+ c->zroot.offs);
+ if (err)
+ return err;
+ }
+ c->zroot.lnum = zbr->lnum;
+ c->zroot.offs = zbr->offs;
+ c->zroot.len = zbr->len;
+ c->zroot.znode = znode;
+ ubifs_assert(!test_bit(OBSOLETE_ZNODE,
+ &zp->flags));
+ ubifs_assert(test_bit(DIRTY_ZNODE, &zp->flags));
+ atomic_long_dec(&c->dirty_zn_cnt);
+
+ if (zp->cnext) {
+ __set_bit(OBSOLETE_ZNODE, &zp->flags);
+ atomic_long_inc(&c->clean_zn_cnt);
+ atomic_long_inc(&ubifs_clean_zn_cnt);
+ } else
+ kfree(zp);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * ubifs_tnc_remove - remove an index entry of a node.
+ * @c: UBIFS file-system description object
+ * @key: key of node
+ *
+ * Returns %0 on success or negative error code on failure.
+ */
+int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key)
+{
+ int found, n, err = 0;
+ struct ubifs_znode *znode;
+
+ mutex_lock(&c->tnc_mutex);
+ dbg_tnc("key %s", DBGKEY(key));
+ found = lookup_level0_dirty(c, key, &znode, &n);
+ if (found < 0) {
+ err = found;
+ goto out_unlock;
+ }
+ if (found == 1)
+ err = tnc_delete(c, znode, n);
+ if (!err)
+ err = dbg_check_tnc(c, 0);
+
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * ubifs_tnc_remove_nm - remove an index entry for a "hashed" node.
+ * @c: UBIFS file-system description object
+ * @key: key of node
+ * @nm: directory entry name
+ *
+ * Returns %0 on success or negative error code on failure.
+ */
+int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
+ const struct qstr *nm)
+{
+ int n, err;
+ struct ubifs_znode *znode;
+
+ mutex_lock(&c->tnc_mutex);
+ dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key));
+ err = lookup_level0_dirty(c, key, &znode, &n);
+ if (err < 0)
+ goto out_unlock;
+
+ if (err) {
+ if (c->replaying)
+ err = fallible_resolve_collision(c, key, &znode, &n,
+ nm, 0);
+ else
+ err = resolve_collision(c, key, &znode, &n, nm);
+ dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n);
+ if (err < 0)
+ goto out_unlock;
+ if (err) {
+ /* Ensure the znode is dirtied */
+ if (znode->cnext || !ubifs_zn_dirty(znode)) {
+ znode = dirty_cow_bottom_up(c, znode);
+ if (IS_ERR(znode)) {
+ err = PTR_ERR(znode);
+ goto out_unlock;
+ }
+ }
+ err = tnc_delete(c, znode, n);
+ }
+ }
+
+out_unlock:
+ if (!err)
+ err = dbg_check_tnc(c, 0);
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * key_in_range - determine if a key falls within a range of keys.
+ * @c: UBIFS file-system description object
+ * @key: key to check
+ * @from_key: lowest key in range
+ * @to_key: highest key in range
+ *
+ * This function returns %1 if the key is in range and %0 otherwise.
+ */
+static int key_in_range(struct ubifs_info *c, union ubifs_key *key,
+ union ubifs_key *from_key, union ubifs_key *to_key)
+{
+ if (keys_cmp(c, key, from_key) < 0)
+ return 0;
+ if (keys_cmp(c, key, to_key) > 0)
+ return 0;
+ return 1;
+}
+
+/**
+ * ubifs_tnc_remove_range - remove index entries in range.
+ * @c: UBIFS file-system description object
+ * @from_key: lowest key to remove
+ * @to_key: highest key to remove
+ *
+ * This function removes index entries starting at @from_key and ending at
+ * @to_key. This function returns zero in case of success and a negative error
+ * code in case of failure.
+ */
+int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
+ union ubifs_key *to_key)
+{
+ int i, n, k, err = 0;
+ struct ubifs_znode *znode;
+ union ubifs_key *key;
+
+ mutex_lock(&c->tnc_mutex);
+ while (1) {
+ /* Find first level 0 znode that contains keys to remove */
+ err = ubifs_lookup_level0(c, from_key, &znode, &n);
+ if (err < 0)
+ goto out_unlock;
+
+ if (err)
+ key = from_key;
+ else {
+ err = tnc_next(c, &znode, &n);
+ if (err == -ENOENT) {
+ err = 0;
+ goto out_unlock;
+ }
+ if (err < 0)
+ goto out_unlock;
+ key = &znode->zbranch[n].key;
+ if (!key_in_range(c, key, from_key, to_key)) {
+ err = 0;
+ goto out_unlock;
+ }
+ }
+
+ /* Ensure the znode is dirtied */
+ if (znode->cnext || !ubifs_zn_dirty(znode)) {
+ znode = dirty_cow_bottom_up(c, znode);
+ if (IS_ERR(znode)) {
+ err = PTR_ERR(znode);
+ goto out_unlock;
+ }
+ }
+
+ /* Remove all keys in range except the first */
+ for (i = n + 1, k = 0; i < znode->child_cnt; i++, k++) {
+ key = &znode->zbranch[i].key;
+ if (!key_in_range(c, key, from_key, to_key))
+ break;
+ lnc_free(&znode->zbranch[i]);
+ err = ubifs_add_dirt(c, znode->zbranch[i].lnum,
+ znode->zbranch[i].len);
+ if (err) {
+ dbg_dump_znode(c, znode);
+ goto out_unlock;
+ }
+ dbg_tnc("removing %s", DBGKEY(key));
+ }
+ if (k) {
+ for (i = n + 1 + k; i < znode->child_cnt; i++)
+ znode->zbranch[i - k] = znode->zbranch[i];
+ znode->child_cnt -= k;
+ }
+
+ /* Now delete the first */
+ err = tnc_delete(c, znode, n);
+ if (err)
+ goto out_unlock;
+ }
+
+out_unlock:
+ if (!err)
+ err = dbg_check_tnc(c, 0);
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * ubifs_tnc_remove_ino - remove an inode from TNC.
+ * @c: UBIFS file-system description object
+ * @inum: inode number to remove
+ *
+ * This function remove inode @inum and all the extended attributes associated
+ * with the anode from TNC and returns zero in case of success or a negative
+ * error code in case of failure.
+ */
+int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
+{
+ union ubifs_key key1, key2;
+ struct ubifs_dent_node *xent, *pxent = NULL;
+ struct qstr nm = { .name = NULL };
+
+ dbg_tnc("ino %lu", inum);
+
+ /*
+ * Walk all extended attribute entries and remove them together with
+ * corresponding extended attribute inodes.
+ */
+ lowest_xent_key(c, &key1, inum);
+ while (1) {
+ ino_t xattr_inum;
+ int err;
+
+ xent = ubifs_tnc_next_ent(c, &key1, &nm);
+ if (IS_ERR(xent)) {
+ err = PTR_ERR(xent);
+ if (err == -ENOENT)
+ break;
+ return err;
+ }
+
+ xattr_inum = le64_to_cpu(xent->inum);
+ dbg_tnc("xent '%s', ino %lu", xent->name, xattr_inum);
+
+ nm.name = xent->name;
+ nm.len = le16_to_cpu(xent->nlen);
+ err = ubifs_tnc_remove_nm(c, &key1, &nm);
+ if (err) {
+ kfree(xent);
+ return err;
+ }
+
+ lowest_ino_key(c, &key1, xattr_inum);
+ highest_ino_key(c, &key2, xattr_inum);
+ err = ubifs_tnc_remove_range(c, &key1, &key2);
+ if (err) {
+ kfree(xent);
+ return err;
+ }
+
+ kfree(pxent);
+ pxent = xent;
+ key_read(c, &xent->key, &key1);
+ }
+
+ kfree(pxent);
+ lowest_ino_key(c, &key1, inum);
+ highest_ino_key(c, &key2, inum);
+
+ return ubifs_tnc_remove_range(c, &key1, &key2);
+}
+
+/**
+ * ubifs_tnc_next_ent - walk directory or extended attribute entries.
+ * @c: UBIFS file-system description object
+ * @key: key of last entry
+ * @nm: name of last entry found or %NULL
+ *
+ * This function finds and reads the next directory or extended attribute entry
+ * after the given key (@key) if there is one. @nm is used to resolve
+ * collisions.
+ *
+ * If the name of the current entry is not known and only the key is known,
+ * @nm->name has to be %NULL. In this case the semantics of this function is a
+ * little bit different and it returns the entry corresponding to this key, not
+ * the next one. If the key was not found, the closest "right" entry is
+ * returned.
+ *
+ * If the fist entry has to be found, @key has to contain the lowest possible
+ * key value for this inode and @name has to be %NULL.
+ *
+ * This function returns the found directory or extended attribute entry node
+ * in case of success, %-ENOENT is returned if no entry was found, and a
+ * negative error code is returned in case of failure.
+ */
+struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
+ union ubifs_key *key,
+ const struct qstr *nm)
+{
+ int n, err, type = key_type(c, key);
+ struct ubifs_znode *znode;
+ struct ubifs_dent_node *dent;
+ struct ubifs_zbranch *zbr;
+ union ubifs_key *dkey;
+
+ dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key));
+ ubifs_assert(is_hash_key(c, key));
+
+ mutex_lock(&c->tnc_mutex);
+ err = ubifs_lookup_level0(c, key, &znode, &n);
+ if (unlikely(err < 0))
+ goto out_unlock;
+
+ if (nm->name) {
+ if (err) {
+ /* Handle collisions */
+ err = resolve_collision(c, key, &znode, &n, nm);
+ dbg_tnc("rc returned %d, znode %p, n %d",
+ err, znode, n);
+ if (unlikely(err < 0))
+ goto out_unlock;
+ }
+
+ /* Now find next entry */
+ err = tnc_next(c, &znode, &n);
+ if (unlikely(err))
+ goto out_unlock;
+ } else {
+ /*
+ * The full name of the entry was not given, in which case the
+ * behavior of this function is a little different and it
+ * returns current entry, not the next one.
+ */
+ if (!err) {
+ /*
+ * However, the given key does not exist in the TNC
+ * tree and @znode/@n variables contain the closest
+ * "preceding" element. Switch to the next one.
+ */
+ err = tnc_next(c, &znode, &n);
+ if (err)
+ goto out_unlock;
+ }
+ }
+
+ zbr = &znode->zbranch[n];
+ dent = kmalloc(zbr->len, GFP_NOFS);
+ if (unlikely(!dent)) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ /*
+ * The above 'tnc_next()' call could lead us to the next inode, check
+ * this.
+ */
+ dkey = &zbr->key;
+ if (key_inum(c, dkey) != key_inum(c, key) ||
+ key_type(c, dkey) != type) {
+ err = -ENOENT;
+ goto out_free;
+ }
+
+ err = tnc_read_node_nm(c, zbr, dent);
+ if (unlikely(err))
+ goto out_free;
+
+ mutex_unlock(&c->tnc_mutex);
+ return dent;
+
+out_free:
+ kfree(dent);
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return ERR_PTR(err);
+}
+
+/**
+ * tnc_destroy_cnext - destroy left-over obsolete znodes from a failed commit.
+ * @c: UBIFS file-system description object
+ *
+ * Destroy left-over obsolete znodes from a failed commit.
+ */
+static void tnc_destroy_cnext(struct ubifs_info *c)
+{
+ struct ubifs_znode *cnext;
+
+ if (!c->cnext)
+ return;
+ ubifs_assert(c->cmt_state == COMMIT_BROKEN);
+ cnext = c->cnext;
+ do {
+ struct ubifs_znode *znode = cnext;
+
+ cnext = cnext->cnext;
+ if (test_bit(OBSOLETE_ZNODE, &znode->flags))
+ kfree(znode);
+ } while (cnext && cnext != c->cnext);
+}
+
+/**
+ * ubifs_tnc_close - close TNC subsystem and free all related resources.
+ * @c: UBIFS file-system description object
+ */
+void ubifs_tnc_close(struct ubifs_info *c)
+{
+ long clean_freed;
+
+ tnc_destroy_cnext(c);
+ if (c->zroot.znode) {
+ clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode);
+ atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt);
+ }
+ kfree(c->gap_lebs);
+ kfree(c->ilebs);
+ destroy_old_idx(c);
+}
+
+/**
+ * left_znode - get the znode to the left.
+ * @c: UBIFS file-system description object
+ * @znode: znode
+ *
+ * This function returns a pointer to the znode to the left of @znode or NULL if
+ * there is not one. A negative error code is returned on failure.
+ */
+static struct ubifs_znode *left_znode(struct ubifs_info *c,
+ struct ubifs_znode *znode)
+{
+ int level = znode->level;
+
+ while (1) {
+ int n = znode->iip - 1;
+
+ /* Go up until we can go left */
+ znode = znode->parent;
+ if (!znode)
+ return NULL;
+ if (n >= 0) {
+ /* Now go down the rightmost branch to 'level' */
+ znode = get_znode(c, znode, n);
+ if (IS_ERR(znode))
+ return znode;
+ while (znode->level != level) {
+ n = znode->child_cnt - 1;
+ znode = get_znode(c, znode, n);
+ if (IS_ERR(znode))
+ return znode;
+ }
+ break;
+ }
+ }
+ return znode;
+}
+
+/**
+ * right_znode - get the znode to the right.
+ * @c: UBIFS file-system description object
+ * @znode: znode
+ *
+ * This function returns a pointer to the znode to the right of @znode or NULL
+ * if there is not one. A negative error code is returned on failure.
+ */
+static struct ubifs_znode *right_znode(struct ubifs_info *c,
+ struct ubifs_znode *znode)
+{
+ int level = znode->level;
+
+ while (1) {
+ int n = znode->iip + 1;
+
+ /* Go up until we can go right */
+ znode = znode->parent;
+ if (!znode)
+ return NULL;
+ if (n < znode->child_cnt) {
+ /* Now go down the leftmost branch to 'level' */
+ znode = get_znode(c, znode, n);
+ if (IS_ERR(znode))
+ return znode;
+ while (znode->level != level) {
+ znode = get_znode(c, znode, 0);
+ if (IS_ERR(znode))
+ return znode;
+ }
+ break;
+ }
+ }
+ return znode;
+}
+
+/**
+ * lookup_znode - find a particular indexing node from TNC.
+ * @c: UBIFS file-system description object
+ * @key: index node key to lookup
+ * @level: index node level
+ * @lnum: index node LEB number
+ * @offs: index node offset
+ *
+ * This function searches an indexing node by its first key @key and its
+ * address @lnum:@offs. It looks up the indexing tree by pulling all indexing
+ * nodes it traverses to TNC. This function is called fro indexing nodes which
+ * were found on the media by scanning, for example when garbage-collecting or
+ * when doing in-the-gaps commit. This means that the indexing node which is
+ * looked for does not have to have exactly the same leftmost key @key, because
+ * the leftmost key may have been changed, in which case TNC will contain a
+ * dirty znode which still refers the same @lnum:@offs. This function is clever
+ * enough to recognize such indexing nodes.
+ *
+ * Note, if a znode was deleted or changed too much, then this function will
+ * not find it. For situations like this UBIFS has the old index RB-tree
+ * (indexed by @lnum:@offs).
+ *
+ * This function returns a pointer to the znode found or %NULL if it is not
+ * found. A negative error code is returned on failure.
+ */
+static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
+ union ubifs_key *key, int level,
+ int lnum, int offs)
+{
+ struct ubifs_znode *znode, *zn;
+ int n, nn;
+
+ /*
+ * The arguments have probably been read off flash, so don't assume
+ * they are valid.
+ */
+ if (level < 0)
+ return ERR_PTR(-EINVAL);
+
+ /* Get the root znode */
+ znode = c->zroot.znode;
+ if (!znode) {
+ znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
+ if (IS_ERR(znode))
+ return znode;
+ }
+ /* Check if it is the one we are looking for */
+ if (c->zroot.lnum == lnum && c->zroot.offs == offs)
+ return znode;
+ /* Descend to the parent level i.e. (level + 1) */
+ if (level >= znode->level)
+ return NULL;
+ while (1) {
+ ubifs_search_zbranch(c, znode, key, &n);
+ if (n < 0) {
+ /*
+ * We reached a znode where the leftmost key is greater
+ * than the key we are searching for. This is the same
+ * situation as the one described in a huge comment at
+ * the end of the 'ubifs_lookup_level0()' function. And
+ * for exactly the same reasons we have to try to look
+ * left before giving up.
+ */
+ znode = left_znode(c, znode);
+ if (!znode)
+ return NULL;
+ if (IS_ERR(znode))
+ return znode;
+ ubifs_search_zbranch(c, znode, key, &n);
+ ubifs_assert(n >= 0);
+ }
+ if (znode->level == level + 1)
+ break;
+ znode = get_znode(c, znode, n);
+ if (IS_ERR(znode))
+ return znode;
+ }
+ /* Check if the child is the one we are looking for */
+ if (znode->zbranch[n].lnum == lnum && znode->zbranch[n].offs == offs)
+ return get_znode(c, znode, n);
+ /* If the key is unique, there is nowhere else to look */
+ if (!is_hash_key(c, key))
+ return NULL;
+ /*
+ * The key is not unique and so may be also in the znodes to either
+ * side.
+ */
+ zn = znode;
+ nn = n;
+ /* Look left */
+ while (1) {
+ /* Move one branch to the left */
+ if (n)
+ n -= 1;
+ else {
+ znode = left_znode(c, znode);
+ if (!znode)
+ break;
+ if (IS_ERR(znode))
+ return znode;
+ n = znode->child_cnt - 1;
+ }
+ /* Check it */
+ if (znode->zbranch[n].lnum == lnum &&
+ znode->zbranch[n].offs == offs)
+ return get_znode(c, znode, n);
+ /* Stop if the key is less than the one we are looking for */
+ if (keys_cmp(c, &znode->zbranch[n].key, key) < 0)
+ break;
+ }
+ /* Back to the middle */
+ znode = zn;
+ n = nn;
+ /* Look right */
+ while (1) {
+ /* Move one branch to the right */
+ if (++n >= znode->child_cnt) {
+ znode = right_znode(c, znode);
+ if (!znode)
+ break;
+ if (IS_ERR(znode))
+ return znode;
+ n = 0;
+ }
+ /* Check it */
+ if (znode->zbranch[n].lnum == lnum &&
+ znode->zbranch[n].offs == offs)
+ return get_znode(c, znode, n);
+ /* Stop if the key is greater than the one we are looking for */
+ if (keys_cmp(c, &znode->zbranch[n].key, key) > 0)
+ break;
+ }
+ return NULL;
+}
+
+/**
+ * is_idx_node_in_tnc - determine if an index node is in the TNC.
+ * @c: UBIFS file-system description object
+ * @key: key of index node
+ * @level: index node level
+ * @lnum: LEB number of index node
+ * @offs: offset of index node
+ *
+ * This function returns %0 if the index node is not referred to in the TNC, %1
+ * if the index node is referred to in the TNC and the corresponding znode is
+ * dirty, %2 if an index node is referred to in the TNC and the corresponding
+ * znode is clean, and a negative error code in case of failure.
+ *
+ * Note, the @key argument has to be the key of the first child. Also note,
+ * this function relies on the fact that 0:0 is never a valid LEB number and
+ * offset for a main-area node.
+ */
+int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level,
+ int lnum, int offs)
+{
+ struct ubifs_znode *znode;
+
+ znode = lookup_znode(c, key, level, lnum, offs);
+ if (!znode)
+ return 0;
+ if (IS_ERR(znode))
+ return PTR_ERR(znode);
+
+ return ubifs_zn_dirty(znode) ? 1 : 2;
+}
+
+/**
+ * is_leaf_node_in_tnc - determine if a non-indexing not is in the TNC.
+ * @c: UBIFS file-system description object
+ * @key: node key
+ * @lnum: node LEB number
+ * @offs: node offset
+ *
+ * This function returns %1 if the node is referred to in the TNC, %0 if it is
+ * not, and a negative error code in case of failure.
+ *
+ * Note, this function relies on the fact that 0:0 is never a valid LEB number
+ * and offset for a main-area node.
+ */
+static int is_leaf_node_in_tnc(struct ubifs_info *c, union ubifs_key *key,
+ int lnum, int offs)
+{
+ struct ubifs_zbranch *zbr;
+ struct ubifs_znode *znode, *zn;
+ int n, found, err, nn;
+ const int unique = !is_hash_key(c, key);
+
+ found = ubifs_lookup_level0(c, key, &znode, &n);
+ if (found < 0)
+ return found; /* Error code */
+ if (!found)
+ return 0;
+ zbr = &znode->zbranch[n];
+ if (lnum == zbr->lnum && offs == zbr->offs)
+ return 1; /* Found it */
+ if (unique)
+ return 0;
+ /*
+ * Because the key is not unique, we have to look left
+ * and right as well
+ */
+ zn = znode;
+ nn = n;
+ /* Look left */
+ while (1) {
+ err = tnc_prev(c, &znode, &n);
+ if (err == -ENOENT)
+ break;
+ if (err)
+ return err;
+ if (keys_cmp(c, key, &znode->zbranch[n].key))
+ break;
+ zbr = &znode->zbranch[n];
+ if (lnum == zbr->lnum && offs == zbr->offs)
+ return 1; /* Found it */
+ }
+ /* Look right */
+ znode = zn;
+ n = nn;
+ while (1) {
+ err = tnc_next(c, &znode, &n);
+ if (err) {
+ if (err == -ENOENT)
+ return 0;
+ return err;
+ }
+ if (keys_cmp(c, key, &znode->zbranch[n].key))
+ break;
+ zbr = &znode->zbranch[n];
+ if (lnum == zbr->lnum && offs == zbr->offs)
+ return 1; /* Found it */
+ }
+ return 0;
+}
+
+/**
+ * ubifs_tnc_has_node - determine whether a node is in the TNC.
+ * @c: UBIFS file-system description object
+ * @key: node key
+ * @level: index node level (if it is an index node)
+ * @lnum: node LEB number
+ * @offs: node offset
+ * @is_idx: non-zero if the node is an index node
+ *
+ * This function returns %1 if the node is in the TNC, %0 if it is not, and a
+ * negative error code in case of failure. For index nodes, @key has to be the
+ * key of the first child. An index node is considered to be in the TNC only if
+ * the corresponding znode is clean or has not been loaded.
+ */
+int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level,
+ int lnum, int offs, int is_idx)
+{
+ int err;
+
+ mutex_lock(&c->tnc_mutex);
+ if (is_idx) {
+ err = is_idx_node_in_tnc(c, key, level, lnum, offs);
+ if (err < 0)
+ goto out_unlock;
+ if (err == 1)
+ /* The index node was found but it was dirty */
+ err = 0;
+ else if (err == 2)
+ /* The index node was found and it was clean */
+ err = 1;
+ else
+ BUG_ON(err != 0);
+ } else
+ err = is_leaf_node_in_tnc(c, key, lnum, offs);
+
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * ubifs_dirty_idx_node - dirty an index node.
+ * @c: UBIFS file-system description object
+ * @key: index node key
+ * @level: index node level
+ * @lnum: index node LEB number
+ * @offs: index node offset
+ *
+ * This function loads and dirties an index node so that it can be garbage
+ * collected. The @key argument has to be the key of the first child. This
+ * function relies on the fact that 0:0 is never a valid LEB number and offset
+ * for a main-area node. Returns %0 on success and a negative error code on
+ * failure.
+ */
+int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level,
+ int lnum, int offs)
+{
+ struct ubifs_znode *znode;
+ int err = 0;
+
+ mutex_lock(&c->tnc_mutex);
+ znode = lookup_znode(c, key, level, lnum, offs);
+ if (!znode)
+ goto out_unlock;
+ if (IS_ERR(znode)) {
+ err = PTR_ERR(znode);
+ goto out_unlock;
+ }
+ znode = dirty_cow_bottom_up(c, znode);
+ if (IS_ERR(znode)) {
+ err = PTR_ERR(znode);
+ goto out_unlock;
+ }
+
+out_unlock:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
new file mode 100644
index 00000000000..8ac76b1c2d5
--- /dev/null
+++ b/fs/ubifs/tnc_commit.c
@@ -0,0 +1,1102 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/* This file implements TNC functions for committing */
+
+#include "ubifs.h"
+
+/**
+ * make_idx_node - make an index node for fill-the-gaps method of TNC commit.
+ * @c: UBIFS file-system description object
+ * @idx: buffer in which to place new index node
+ * @znode: znode from which to make new index node
+ * @lnum: LEB number where new index node will be written
+ * @offs: offset where new index node will be written
+ * @len: length of new index node
+ */
+static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx,
+ struct ubifs_znode *znode, int lnum, int offs, int len)
+{
+ struct ubifs_znode *zp;
+ int i, err;
+
+ /* Make index node */
+ idx->ch.node_type = UBIFS_IDX_NODE;
+ idx->child_cnt = cpu_to_le16(znode->child_cnt);
+ idx->level = cpu_to_le16(znode->level);
+ for (i = 0; i < znode->child_cnt; i++) {
+ struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
+ struct ubifs_zbranch *zbr = &znode->zbranch[i];
+
+ key_write_idx(c, &zbr->key, &br->key);
+ br->lnum = cpu_to_le32(zbr->lnum);
+ br->offs = cpu_to_le32(zbr->offs);
+ br->len = cpu_to_le32(zbr->len);
+ if (!zbr->lnum || !zbr->len) {
+ ubifs_err("bad ref in znode");
+ dbg_dump_znode(c, znode);
+ if (zbr->znode)
+ dbg_dump_znode(c, zbr->znode);
+ }
+ }
+ ubifs_prepare_node(c, idx, len, 0);
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ znode->lnum = lnum;
+ znode->offs = offs;
+ znode->len = len;
+#endif
+
+ err = insert_old_idx_znode(c, znode);
+
+ /* Update the parent */
+ zp = znode->parent;
+ if (zp) {
+ struct ubifs_zbranch *zbr;
+
+ zbr = &zp->zbranch[znode->iip];
+ zbr->lnum = lnum;
+ zbr->offs = offs;
+ zbr->len = len;
+ } else {
+ c->zroot.lnum = lnum;
+ c->zroot.offs = offs;
+ c->zroot.len = len;
+ }
+ c->calc_idx_sz += ALIGN(len, 8);
+
+ atomic_long_dec(&c->dirty_zn_cnt);
+
+ ubifs_assert(ubifs_zn_dirty(znode));
+ ubifs_assert(test_bit(COW_ZNODE, &znode->flags));
+
+ __clear_bit(DIRTY_ZNODE, &znode->flags);
+ __clear_bit(COW_ZNODE, &znode->flags);
+
+ return err;
+}
+
+/**
+ * fill_gap - make index nodes in gaps in dirty index LEBs.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number that gap appears in
+ * @gap_start: offset of start of gap
+ * @gap_end: offset of end of gap
+ * @dirt: adds dirty space to this
+ *
+ * This function returns the number of index nodes written into the gap.
+ */
+static int fill_gap(struct ubifs_info *c, int lnum, int gap_start, int gap_end,
+ int *dirt)
+{
+ int len, gap_remains, gap_pos, written, pad_len;
+
+ ubifs_assert((gap_start & 7) == 0);
+ ubifs_assert((gap_end & 7) == 0);
+ ubifs_assert(gap_end >= gap_start);
+
+ gap_remains = gap_end - gap_start;
+ if (!gap_remains)
+ return 0;
+ gap_pos = gap_start;
+ written = 0;
+ while (c->enext) {
+ len = ubifs_idx_node_sz(c, c->enext->child_cnt);
+ if (len < gap_remains) {
+ struct ubifs_znode *znode = c->enext;
+ const int alen = ALIGN(len, 8);
+ int err;
+
+ ubifs_assert(alen <= gap_remains);
+ err = make_idx_node(c, c->ileb_buf + gap_pos, znode,
+ lnum, gap_pos, len);
+ if (err)
+ return err;
+ gap_remains -= alen;
+ gap_pos += alen;
+ c->enext = znode->cnext;
+ if (c->enext == c->cnext)
+ c->enext = NULL;
+ written += 1;
+ } else
+ break;
+ }
+ if (gap_end == c->leb_size) {
+ c->ileb_len = ALIGN(gap_pos, c->min_io_size);
+ /* Pad to end of min_io_size */
+ pad_len = c->ileb_len - gap_pos;
+ } else
+ /* Pad to end of gap */
+ pad_len = gap_remains;
+ dbg_gc("LEB %d:%d to %d len %d nodes written %d wasted bytes %d",
+ lnum, gap_start, gap_end, gap_end - gap_start, written, pad_len);
+ ubifs_pad(c, c->ileb_buf + gap_pos, pad_len);
+ *dirt += pad_len;
+ return written;
+}
+
+/**
+ * find_old_idx - find an index node obsoleted since the last commit start.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of obsoleted index node
+ * @offs: offset of obsoleted index node
+ *
+ * Returns %1 if found and %0 otherwise.
+ */
+static int find_old_idx(struct ubifs_info *c, int lnum, int offs)
+{
+ struct ubifs_old_idx *o;
+ struct rb_node *p;
+
+ p = c->old_idx.rb_node;
+ while (p) {
+ o = rb_entry(p, struct ubifs_old_idx, rb);
+ if (lnum < o->lnum)
+ p = p->rb_left;
+ else if (lnum > o->lnum)
+ p = p->rb_right;
+ else if (offs < o->offs)
+ p = p->rb_left;
+ else if (offs > o->offs)
+ p = p->rb_right;
+ else
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * is_idx_node_in_use - determine if an index node can be overwritten.
+ * @c: UBIFS file-system description object
+ * @key: key of index node
+ * @level: index node level
+ * @lnum: LEB number of index node
+ * @offs: offset of index node
+ *
+ * If @key / @lnum / @offs identify an index node that was not part of the old
+ * index, then this function returns %0 (obsolete). Else if the index node was
+ * part of the old index but is now dirty %1 is returned, else if it is clean %2
+ * is returned. A negative error code is returned on failure.
+ */
+static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key,
+ int level, int lnum, int offs)
+{
+ int ret;
+
+ ret = is_idx_node_in_tnc(c, key, level, lnum, offs);
+ if (ret < 0)
+ return ret; /* Error code */
+ if (ret == 0)
+ if (find_old_idx(c, lnum, offs))
+ return 1;
+ return ret;
+}
+
+/**
+ * layout_leb_in_gaps - layout index nodes using in-the-gaps method.
+ * @c: UBIFS file-system description object
+ * @p: return LEB number here
+ *
+ * This function lays out new index nodes for dirty znodes using in-the-gaps
+ * method of TNC commit.
+ * This function merely puts the next znode into the next gap, making no attempt
+ * to try to maximise the number of znodes that fit.
+ * This function returns the number of index nodes written into the gaps, or a
+ * negative error code on failure.
+ */
+static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
+{
+ struct ubifs_scan_leb *sleb;
+ struct ubifs_scan_node *snod;
+ int lnum, dirt = 0, gap_start, gap_end, err, written, tot_written;
+
+ tot_written = 0;
+ /* Get an index LEB with lots of obsolete index nodes */
+ lnum = ubifs_find_dirty_idx_leb(c);
+ if (lnum < 0)
+ /*
+ * There also may be dirt in the index head that could be
+ * filled, however we do not check there at present.
+ */
+ return lnum; /* Error code */
+ *p = lnum;
+ dbg_gc("LEB %d", lnum);
+ /*
+ * Scan the index LEB. We use the generic scan for this even though
+ * it is more comprehensive and less efficient than is needed for this
+ * purpose.
+ */
+ sleb = ubifs_scan(c, lnum, 0, c->ileb_buf);
+ c->ileb_len = 0;
+ if (IS_ERR(sleb))
+ return PTR_ERR(sleb);
+ gap_start = 0;
+ list_for_each_entry(snod, &sleb->nodes, list) {
+ struct ubifs_idx_node *idx;
+ int in_use, level;
+
+ ubifs_assert(snod->type == UBIFS_IDX_NODE);
+ idx = snod->node;
+ key_read(c, ubifs_idx_key(c, idx), &snod->key);
+ level = le16_to_cpu(idx->level);
+ /* Determine if the index node is in use (not obsolete) */
+ in_use = is_idx_node_in_use(c, &snod->key, level, lnum,
+ snod->offs);
+ if (in_use < 0) {
+ ubifs_scan_destroy(sleb);
+ return in_use; /* Error code */
+ }
+ if (in_use) {
+ if (in_use == 1)
+ dirt += ALIGN(snod->len, 8);
+ /*
+ * The obsolete index nodes form gaps that can be
+ * overwritten. This gap has ended because we have
+ * found an index node that is still in use
+ * i.e. not obsolete
+ */
+ gap_end = snod->offs;
+ /* Try to fill gap */
+ written = fill_gap(c, lnum, gap_start, gap_end, &dirt);
+ if (written < 0) {
+ ubifs_scan_destroy(sleb);
+ return written; /* Error code */
+ }
+ tot_written += written;
+ gap_start = ALIGN(snod->offs + snod->len, 8);
+ }
+ }
+ ubifs_scan_destroy(sleb);
+ c->ileb_len = c->leb_size;
+ gap_end = c->leb_size;
+ /* Try to fill gap */
+ written = fill_gap(c, lnum, gap_start, gap_end, &dirt);
+ if (written < 0)
+ return written; /* Error code */
+ tot_written += written;
+ if (tot_written == 0) {
+ struct ubifs_lprops lp;
+
+ dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
+ err = ubifs_read_one_lp(c, lnum, &lp);
+ if (err)
+ return err;
+ if (lp.free == c->leb_size) {
+ /*
+ * We must have snatched this LEB from the idx_gc list
+ * so we need to correct the free and dirty space.
+ */
+ err = ubifs_change_one_lp(c, lnum,
+ c->leb_size - c->ileb_len,
+ dirt, 0, 0, 0);
+ if (err)
+ return err;
+ }
+ return 0;
+ }
+ err = ubifs_change_one_lp(c, lnum, c->leb_size - c->ileb_len, dirt,
+ 0, 0, 0);
+ if (err)
+ return err;
+ err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len,
+ UBI_SHORTTERM);
+ if (err)
+ return err;
+ dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
+ return tot_written;
+}
+
+/**
+ * get_leb_cnt - calculate the number of empty LEBs needed to commit.
+ * @c: UBIFS file-system description object
+ * @cnt: number of znodes to commit
+ *
+ * This function returns the number of empty LEBs needed to commit @cnt znodes
+ * to the current index head. The number is not exact and may be more than
+ * needed.
+ */
+static int get_leb_cnt(struct ubifs_info *c, int cnt)
+{
+ int d;
+
+ /* Assume maximum index node size (i.e. overestimate space needed) */
+ cnt -= (c->leb_size - c->ihead_offs) / c->max_idx_node_sz;
+ if (cnt < 0)
+ cnt = 0;
+ d = c->leb_size / c->max_idx_node_sz;
+ return DIV_ROUND_UP(cnt, d);
+}
+
+/**
+ * layout_in_gaps - in-the-gaps method of committing TNC.
+ * @c: UBIFS file-system description object
+ * @cnt: number of dirty znodes to commit.
+ *
+ * This function lays out new index nodes for dirty znodes using in-the-gaps
+ * method of TNC commit.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int layout_in_gaps(struct ubifs_info *c, int cnt)
+{
+ int err, leb_needed_cnt, written, *p;
+
+ dbg_gc("%d znodes to write", cnt);
+
+ c->gap_lebs = kmalloc(sizeof(int) * (c->lst.idx_lebs + 1), GFP_NOFS);
+ if (!c->gap_lebs)
+ return -ENOMEM;
+
+ p = c->gap_lebs;
+ do {
+ ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs);
+ written = layout_leb_in_gaps(c, p);
+ if (written < 0) {
+ err = written;
+ if (err != -ENOSPC) {
+ kfree(c->gap_lebs);
+ c->gap_lebs = NULL;
+ return err;
+ }
+ if (!dbg_force_in_the_gaps_enabled) {
+ /*
+ * Do not print scary warnings if the debugging
+ * option which forces in-the-gaps is enabled.
+ */
+ ubifs_err("out of space");
+ spin_lock(&c->space_lock);
+ dbg_dump_budg(c);
+ spin_unlock(&c->space_lock);
+ dbg_dump_lprops(c);
+ }
+ /* Try to commit anyway */
+ err = 0;
+ break;
+ }
+ p++;
+ cnt -= written;
+ leb_needed_cnt = get_leb_cnt(c, cnt);
+ dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt,
+ leb_needed_cnt, c->ileb_cnt);
+ } while (leb_needed_cnt > c->ileb_cnt);
+
+ *p = -1;
+ return 0;
+}
+
+/**
+ * layout_in_empty_space - layout index nodes in empty space.
+ * @c: UBIFS file-system description object
+ *
+ * This function lays out new index nodes for dirty znodes using empty LEBs.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int layout_in_empty_space(struct ubifs_info *c)
+{
+ struct ubifs_znode *znode, *cnext, *zp;
+ int lnum, offs, len, next_len, buf_len, buf_offs, used, avail;
+ int wlen, blen, err;
+
+ cnext = c->enext;
+ if (!cnext)
+ return 0;
+
+ lnum = c->ihead_lnum;
+ buf_offs = c->ihead_offs;
+
+ buf_len = ubifs_idx_node_sz(c, c->fanout);
+ buf_len = ALIGN(buf_len, c->min_io_size);
+ used = 0;
+ avail = buf_len;
+
+ /* Ensure there is enough room for first write */
+ next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
+ if (buf_offs + next_len > c->leb_size)
+ lnum = -1;
+
+ while (1) {
+ znode = cnext;
+
+ len = ubifs_idx_node_sz(c, znode->child_cnt);
+
+ /* Determine the index node position */
+ if (lnum == -1) {
+ if (c->ileb_nxt >= c->ileb_cnt) {
+ ubifs_err("out of space");
+ return -ENOSPC;
+ }
+ lnum = c->ilebs[c->ileb_nxt++];
+ buf_offs = 0;
+ used = 0;
+ avail = buf_len;
+ }
+
+ offs = buf_offs + used;
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ znode->lnum = lnum;
+ znode->offs = offs;
+ znode->len = len;
+#endif
+
+ /* Update the parent */
+ zp = znode->parent;
+ if (zp) {
+ struct ubifs_zbranch *zbr;
+ int i;
+
+ i = znode->iip;
+ zbr = &zp->zbranch[i];
+ zbr->lnum = lnum;
+ zbr->offs = offs;
+ zbr->len = len;
+ } else {
+ c->zroot.lnum = lnum;
+ c->zroot.offs = offs;
+ c->zroot.len = len;
+ }
+ c->calc_idx_sz += ALIGN(len, 8);
+
+ /*
+ * Once lprops is updated, we can decrease the dirty znode count
+ * but it is easier to just do it here.
+ */
+ atomic_long_dec(&c->dirty_zn_cnt);
+
+ /*
+ * Calculate the next index node length to see if there is
+ * enough room for it
+ */
+ cnext = znode->cnext;
+ if (cnext == c->cnext)
+ next_len = 0;
+ else
+ next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
+
+ if (c->min_io_size == 1) {
+ buf_offs += ALIGN(len, 8);
+ if (next_len) {
+ if (buf_offs + next_len <= c->leb_size)
+ continue;
+ err = ubifs_update_one_lp(c, lnum, 0,
+ c->leb_size - buf_offs, 0, 0);
+ if (err)
+ return err;
+ lnum = -1;
+ continue;
+ }
+ err = ubifs_update_one_lp(c, lnum,
+ c->leb_size - buf_offs, 0, 0, 0);
+ if (err)
+ return err;
+ break;
+ }
+
+ /* Update buffer positions */
+ wlen = used + len;
+ used += ALIGN(len, 8);
+ avail -= ALIGN(len, 8);
+
+ if (next_len != 0 &&
+ buf_offs + used + next_len <= c->leb_size &&
+ avail > 0)
+ continue;
+
+ if (avail <= 0 && next_len &&
+ buf_offs + used + next_len <= c->leb_size)
+ blen = buf_len;
+ else
+ blen = ALIGN(wlen, c->min_io_size);
+
+ /* The buffer is full or there are no more znodes to do */
+ buf_offs += blen;
+ if (next_len) {
+ if (buf_offs + next_len > c->leb_size) {
+ err = ubifs_update_one_lp(c, lnum,
+ c->leb_size - buf_offs, blen - used,
+ 0, 0);
+ if (err)
+ return err;
+ lnum = -1;
+ }
+ used -= blen;
+ if (used < 0)
+ used = 0;
+ avail = buf_len - used;
+ continue;
+ }
+ err = ubifs_update_one_lp(c, lnum, c->leb_size - buf_offs,
+ blen - used, 0, 0);
+ if (err)
+ return err;
+ break;
+ }
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ c->new_ihead_lnum = lnum;
+ c->new_ihead_offs = buf_offs;
+#endif
+
+ return 0;
+}
+
+/**
+ * layout_commit - determine positions of index nodes to commit.
+ * @c: UBIFS file-system description object
+ * @no_space: indicates that insufficient empty LEBs were allocated
+ * @cnt: number of znodes to commit
+ *
+ * Calculate and update the positions of index nodes to commit. If there were
+ * an insufficient number of empty LEBs allocated, then index nodes are placed
+ * into the gaps created by obsolete index nodes in non-empty index LEBs. For
+ * this purpose, an obsolete index node is one that was not in the index as at
+ * the end of the last commit. To write "in-the-gaps" requires that those index
+ * LEBs are updated atomically in-place.
+ */
+static int layout_commit(struct ubifs_info *c, int no_space, int cnt)
+{
+ int err;
+
+ if (no_space) {
+ err = layout_in_gaps(c, cnt);
+ if (err)
+ return err;
+ }
+ err = layout_in_empty_space(c);
+ return err;
+}
+
+/**
+ * find_first_dirty - find first dirty znode.
+ * @znode: znode to begin searching from
+ */
+static struct ubifs_znode *find_first_dirty(struct ubifs_znode *znode)
+{
+ int i, cont;
+
+ if (!znode)
+ return NULL;
+
+ while (1) {
+ if (znode->level == 0) {
+ if (ubifs_zn_dirty(znode))
+ return znode;
+ return NULL;
+ }
+ cont = 0;
+ for (i = 0; i < znode->child_cnt; i++) {
+ struct ubifs_zbranch *zbr = &znode->zbranch[i];
+
+ if (zbr->znode && ubifs_zn_dirty(zbr->znode)) {
+ znode = zbr->znode;
+ cont = 1;
+ break;
+ }
+ }
+ if (!cont) {
+ if (ubifs_zn_dirty(znode))
+ return znode;
+ return NULL;
+ }
+ }
+}
+
+/**
+ * find_next_dirty - find next dirty znode.
+ * @znode: znode to begin searching from
+ */
+static struct ubifs_znode *find_next_dirty(struct ubifs_znode *znode)
+{
+ int n = znode->iip + 1;
+
+ znode = znode->parent;
+ if (!znode)
+ return NULL;
+ for (; n < znode->child_cnt; n++) {
+ struct ubifs_zbranch *zbr = &znode->zbranch[n];
+
+ if (zbr->znode && ubifs_zn_dirty(zbr->znode))
+ return find_first_dirty(zbr->znode);
+ }
+ return znode;
+}
+
+/**
+ * get_znodes_to_commit - create list of dirty znodes to commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the number of znodes to commit.
+ */
+static int get_znodes_to_commit(struct ubifs_info *c)
+{
+ struct ubifs_znode *znode, *cnext;
+ int cnt = 0;
+
+ c->cnext = find_first_dirty(c->zroot.znode);
+ znode = c->enext = c->cnext;
+ if (!znode) {
+ dbg_cmt("no znodes to commit");
+ return 0;
+ }
+ cnt += 1;
+ while (1) {
+ ubifs_assert(!test_bit(COW_ZNODE, &znode->flags));
+ __set_bit(COW_ZNODE, &znode->flags);
+ znode->alt = 0;
+ cnext = find_next_dirty(znode);
+ if (!cnext) {
+ znode->cnext = c->cnext;
+ break;
+ }
+ znode->cnext = cnext;
+ znode = cnext;
+ cnt += 1;
+ }
+ dbg_cmt("committing %d znodes", cnt);
+ ubifs_assert(cnt == atomic_long_read(&c->dirty_zn_cnt));
+ return cnt;
+}
+
+/**
+ * alloc_idx_lebs - allocate empty LEBs to be used to commit.
+ * @c: UBIFS file-system description object
+ * @cnt: number of znodes to commit
+ *
+ * This function returns %-ENOSPC if it cannot allocate a sufficient number of
+ * empty LEBs. %0 is returned on success, otherwise a negative error code
+ * is returned.
+ */
+static int alloc_idx_lebs(struct ubifs_info *c, int cnt)
+{
+ int i, leb_cnt, lnum;
+
+ c->ileb_cnt = 0;
+ c->ileb_nxt = 0;
+ leb_cnt = get_leb_cnt(c, cnt);
+ dbg_cmt("need about %d empty LEBS for TNC commit", leb_cnt);
+ if (!leb_cnt)
+ return 0;
+ c->ilebs = kmalloc(leb_cnt * sizeof(int), GFP_NOFS);
+ if (!c->ilebs)
+ return -ENOMEM;
+ for (i = 0; i < leb_cnt; i++) {
+ lnum = ubifs_find_free_leb_for_idx(c);
+ if (lnum < 0)
+ return lnum;
+ c->ilebs[c->ileb_cnt++] = lnum;
+ dbg_cmt("LEB %d", lnum);
+ }
+ if (dbg_force_in_the_gaps())
+ return -ENOSPC;
+ return 0;
+}
+
+/**
+ * free_unused_idx_lebs - free unused LEBs that were allocated for the commit.
+ * @c: UBIFS file-system description object
+ *
+ * It is possible that we allocate more empty LEBs for the commit than we need.
+ * This functions frees the surplus.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int free_unused_idx_lebs(struct ubifs_info *c)
+{
+ int i, err = 0, lnum, er;
+
+ for (i = c->ileb_nxt; i < c->ileb_cnt; i++) {
+ lnum = c->ilebs[i];
+ dbg_cmt("LEB %d", lnum);
+ er = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+ LPROPS_INDEX | LPROPS_TAKEN, 0);
+ if (!err)
+ err = er;
+ }
+ return err;
+}
+
+/**
+ * free_idx_lebs - free unused LEBs after commit end.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int free_idx_lebs(struct ubifs_info *c)
+{
+ int err;
+
+ err = free_unused_idx_lebs(c);
+ kfree(c->ilebs);
+ c->ilebs = NULL;
+ return err;
+}
+
+/**
+ * ubifs_tnc_start_commit - start TNC commit.
+ * @c: UBIFS file-system description object
+ * @zroot: new index root position is returned here
+ *
+ * This function prepares the list of indexing nodes to commit and lays out
+ * their positions on flash. If there is not enough free space it uses the
+ * in-gap commit method. Returns zero in case of success and a negative error
+ * code in case of failure.
+ */
+int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
+{
+ int err = 0, cnt;
+
+ mutex_lock(&c->tnc_mutex);
+ err = dbg_check_tnc(c, 1);
+ if (err)
+ goto out;
+ cnt = get_znodes_to_commit(c);
+ if (cnt != 0) {
+ int no_space = 0;
+
+ err = alloc_idx_lebs(c, cnt);
+ if (err == -ENOSPC)
+ no_space = 1;
+ else if (err)
+ goto out_free;
+ err = layout_commit(c, no_space, cnt);
+ if (err)
+ goto out_free;
+ ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0);
+ err = free_unused_idx_lebs(c);
+ if (err)
+ goto out;
+ }
+ destroy_old_idx(c);
+ memcpy(zroot, &c->zroot, sizeof(struct ubifs_zbranch));
+
+ err = ubifs_save_dirty_idx_lnums(c);
+ if (err)
+ goto out;
+
+ spin_lock(&c->space_lock);
+ /*
+ * Although we have not finished committing yet, update size of the
+ * committed index ('c->old_idx_sz') and zero out the index growth
+ * budget. It is OK to do this now, because we've reserved all the
+ * space which is needed to commit the index, and it is save for the
+ * budgeting subsystem to assume the index is already committed,
+ * even though it is not.
+ */
+ c->old_idx_sz = c->calc_idx_sz;
+ c->budg_uncommitted_idx = 0;
+ spin_unlock(&c->space_lock);
+ mutex_unlock(&c->tnc_mutex);
+
+ dbg_cmt("number of index LEBs %d", c->lst.idx_lebs);
+ dbg_cmt("size of index %llu", c->calc_idx_sz);
+ return err;
+
+out_free:
+ free_idx_lebs(c);
+out:
+ mutex_unlock(&c->tnc_mutex);
+ return err;
+}
+
+/**
+ * write_index - write index nodes.
+ * @c: UBIFS file-system description object
+ *
+ * This function writes the index nodes whose positions were laid out in the
+ * layout_in_empty_space function.
+ */
+static int write_index(struct ubifs_info *c)
+{
+ struct ubifs_idx_node *idx;
+ struct ubifs_znode *znode, *cnext;
+ int i, lnum, offs, len, next_len, buf_len, buf_offs, used;
+ int avail, wlen, err, lnum_pos = 0;
+
+ cnext = c->enext;
+ if (!cnext)
+ return 0;
+
+ /*
+ * Always write index nodes to the index head so that index nodes and
+ * other types of nodes are never mixed in the same erase block.
+ */
+ lnum = c->ihead_lnum;
+ buf_offs = c->ihead_offs;
+
+ /* Allocate commit buffer */
+ buf_len = ALIGN(c->max_idx_node_sz, c->min_io_size);
+ used = 0;
+ avail = buf_len;
+
+ /* Ensure there is enough room for first write */
+ next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
+ if (buf_offs + next_len > c->leb_size) {
+ err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, 0,
+ LPROPS_TAKEN);
+ if (err)
+ return err;
+ lnum = -1;
+ }
+
+ while (1) {
+ cond_resched();
+
+ znode = cnext;
+ idx = c->cbuf + used;
+
+ /* Make index node */
+ idx->ch.node_type = UBIFS_IDX_NODE;
+ idx->child_cnt = cpu_to_le16(znode->child_cnt);
+ idx->level = cpu_to_le16(znode->level);
+ for (i = 0; i < znode->child_cnt; i++) {
+ struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
+ struct ubifs_zbranch *zbr = &znode->zbranch[i];
+
+ key_write_idx(c, &zbr->key, &br->key);
+ br->lnum = cpu_to_le32(zbr->lnum);
+ br->offs = cpu_to_le32(zbr->offs);
+ br->len = cpu_to_le32(zbr->len);
+ if (!zbr->lnum || !zbr->len) {
+ ubifs_err("bad ref in znode");
+ dbg_dump_znode(c, znode);
+ if (zbr->znode)
+ dbg_dump_znode(c, zbr->znode);
+ }
+ }
+ len = ubifs_idx_node_sz(c, znode->child_cnt);
+ ubifs_prepare_node(c, idx, len, 0);
+
+ /* Determine the index node position */
+ if (lnum == -1) {
+ lnum = c->ilebs[lnum_pos++];
+ buf_offs = 0;
+ used = 0;
+ avail = buf_len;
+ }
+ offs = buf_offs + used;
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ if (lnum != znode->lnum || offs != znode->offs ||
+ len != znode->len) {
+ ubifs_err("inconsistent znode posn");
+ return -EINVAL;
+ }
+#endif
+
+ /* Grab some stuff from znode while we still can */
+ cnext = znode->cnext;
+
+ ubifs_assert(ubifs_zn_dirty(znode));
+ ubifs_assert(test_bit(COW_ZNODE, &znode->flags));
+
+ /*
+ * It is important that other threads should see %DIRTY_ZNODE
+ * flag cleared before %COW_ZNODE. Specifically, it matters in
+ * the 'dirty_cow_znode()' function. This is the reason for the
+ * first barrier. Also, we want the bit changes to be seen to
+ * other threads ASAP, to avoid unnecesarry copying, which is
+ * the reason for the second barrier.
+ */
+ clear_bit(DIRTY_ZNODE, &znode->flags);
+ smp_mb__before_clear_bit();
+ clear_bit(COW_ZNODE, &znode->flags);
+ smp_mb__after_clear_bit();
+
+ /* Do not access znode from this point on */
+
+ /* Update buffer positions */
+ wlen = used + len;
+ used += ALIGN(len, 8);
+ avail -= ALIGN(len, 8);
+
+ /*
+ * Calculate the next index node length to see if there is
+ * enough room for it
+ */
+ if (cnext == c->cnext)
+ next_len = 0;
+ else
+ next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
+
+ if (c->min_io_size == 1) {
+ /*
+ * Write the prepared index node immediately if there is
+ * no minimum IO size
+ */
+ err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs,
+ wlen, UBI_SHORTTERM);
+ if (err)
+ return err;
+ buf_offs += ALIGN(wlen, 8);
+ if (next_len) {
+ used = 0;
+ avail = buf_len;
+ if (buf_offs + next_len > c->leb_size) {
+ err = ubifs_update_one_lp(c, lnum,
+ LPROPS_NC, 0, 0, LPROPS_TAKEN);
+ if (err)
+ return err;
+ lnum = -1;
+ }
+ continue;
+ }
+ } else {
+ int blen, nxt_offs = buf_offs + used + next_len;
+
+ if (next_len && nxt_offs <= c->leb_size) {
+ if (avail > 0)
+ continue;
+ else
+ blen = buf_len;
+ } else {
+ wlen = ALIGN(wlen, 8);
+ blen = ALIGN(wlen, c->min_io_size);
+ ubifs_pad(c, c->cbuf + wlen, blen - wlen);
+ }
+ /*
+ * The buffer is full or there are no more znodes
+ * to do
+ */
+ err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs,
+ blen, UBI_SHORTTERM);
+ if (err)
+ return err;
+ buf_offs += blen;
+ if (next_len) {
+ if (nxt_offs > c->leb_size) {
+ err = ubifs_update_one_lp(c, lnum,
+ LPROPS_NC, 0, 0, LPROPS_TAKEN);
+ if (err)
+ return err;
+ lnum = -1;
+ }
+ used -= blen;
+ if (used < 0)
+ used = 0;
+ avail = buf_len - used;
+ memmove(c->cbuf, c->cbuf + blen, used);
+ continue;
+ }
+ }
+ break;
+ }
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) {
+ ubifs_err("inconsistent ihead");
+ return -EINVAL;
+ }
+#endif
+
+ c->ihead_lnum = lnum;
+ c->ihead_offs = buf_offs;
+
+ return 0;
+}
+
+/**
+ * free_obsolete_znodes - free obsolete znodes.
+ * @c: UBIFS file-system description object
+ *
+ * At the end of commit end, obsolete znodes are freed.
+ */
+static void free_obsolete_znodes(struct ubifs_info *c)
+{
+ struct ubifs_znode *znode, *cnext;
+
+ cnext = c->cnext;
+ do {
+ znode = cnext;
+ cnext = znode->cnext;
+ if (test_bit(OBSOLETE_ZNODE, &znode->flags))
+ kfree(znode);
+ else {
+ znode->cnext = NULL;
+ atomic_long_inc(&c->clean_zn_cnt);
+ atomic_long_inc(&ubifs_clean_zn_cnt);
+ }
+ } while (cnext != c->cnext);
+}
+
+/**
+ * return_gap_lebs - return LEBs used by the in-gap commit method.
+ * @c: UBIFS file-system description object
+ *
+ * This function clears the "taken" flag for the LEBs which were used by the
+ * "commit in-the-gaps" method.
+ */
+static int return_gap_lebs(struct ubifs_info *c)
+{
+ int *p, err;
+
+ if (!c->gap_lebs)
+ return 0;
+
+ dbg_cmt("");
+ for (p = c->gap_lebs; *p != -1; p++) {
+ err = ubifs_change_one_lp(c, *p, LPROPS_NC, LPROPS_NC, 0,
+ LPROPS_TAKEN, 0);
+ if (err)
+ return err;
+ }
+
+ kfree(c->gap_lebs);
+ c->gap_lebs = NULL;
+ return 0;
+}
+
+/**
+ * ubifs_tnc_end_commit - update the TNC for commit end.
+ * @c: UBIFS file-system description object
+ *
+ * Write the dirty znodes.
+ */
+int ubifs_tnc_end_commit(struct ubifs_info *c)
+{
+ int err;
+
+ if (!c->cnext)
+ return 0;
+
+ err = return_gap_lebs(c);
+ if (err)
+ return err;
+
+ err = write_index(c);
+ if (err)
+ return err;
+
+ mutex_lock(&c->tnc_mutex);
+
+ dbg_cmt("TNC height is %d", c->zroot.znode->level + 1);
+
+ free_obsolete_znodes(c);
+
+ c->cnext = NULL;
+ kfree(c->ilebs);
+ c->ilebs = NULL;
+
+ mutex_unlock(&c->tnc_mutex);
+
+ return 0;
+}
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
new file mode 100644
index 00000000000..a25c1cc1f8d
--- /dev/null
+++ b/fs/ubifs/tnc_misc.c
@@ -0,0 +1,494 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ * Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file contains miscelanious TNC-related functions shared betweend
+ * different files. This file does not form any logically separate TNC
+ * sub-system. The file was created because there is a lot of TNC code and
+ * putting it all in one file would make that file too big and unreadable.
+ */
+
+#include "ubifs.h"
+
+/**
+ * ubifs_tnc_levelorder_next - next TNC tree element in levelorder traversal.
+ * @zr: root of the subtree to traverse
+ * @znode: previous znode
+ *
+ * This function implements levelorder TNC traversal. The LNC is ignored.
+ * Returns the next element or %NULL if @znode is already the last one.
+ */
+struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr,
+ struct ubifs_znode *znode)
+{
+ int level, iip, level_search = 0;
+ struct ubifs_znode *zn;
+
+ ubifs_assert(zr);
+
+ if (unlikely(!znode))
+ return zr;
+
+ if (unlikely(znode == zr)) {
+ if (znode->level == 0)
+ return NULL;
+ return ubifs_tnc_find_child(zr, 0);
+ }
+
+ level = znode->level;
+
+ iip = znode->iip;
+ while (1) {
+ ubifs_assert(znode->level <= zr->level);
+
+ /*
+ * First walk up until there is a znode with next branch to
+ * look at.
+ */
+ while (znode->parent != zr && iip >= znode->parent->child_cnt) {
+ znode = znode->parent;
+ iip = znode->iip;
+ }
+
+ if (unlikely(znode->parent == zr &&
+ iip >= znode->parent->child_cnt)) {
+ /* This level is done, switch to the lower one */
+ level -= 1;
+ if (level_search || level < 0)
+ /*
+ * We were already looking for znode at lower
+ * level ('level_search'). As we are here
+ * again, it just does not exist. Or all levels
+ * were finished ('level < 0').
+ */
+ return NULL;
+
+ level_search = 1;
+ iip = -1;
+ znode = ubifs_tnc_find_child(zr, 0);
+ ubifs_assert(znode);
+ }
+
+ /* Switch to the next index */
+ zn = ubifs_tnc_find_child(znode->parent, iip + 1);
+ if (!zn) {
+ /* No more children to look at, we have walk up */
+ iip = znode->parent->child_cnt;
+ continue;
+ }
+
+ /* Walk back down to the level we came from ('level') */
+ while (zn->level != level) {
+ znode = zn;
+ zn = ubifs_tnc_find_child(zn, 0);
+ if (!zn) {
+ /*
+ * This path is not too deep so it does not
+ * reach 'level'. Try next path.
+ */
+ iip = znode->iip;
+ break;
+ }
+ }
+
+ if (zn) {
+ ubifs_assert(zn->level >= 0);
+ return zn;
+ }
+ }
+}
+
+/**
+ * ubifs_search_zbranch - search znode branch.
+ * @c: UBIFS file-system description object
+ * @znode: znode to search in
+ * @key: key to search for
+ * @n: znode branch slot number is returned here
+ *
+ * This is a helper function which search branch with key @key in @znode using
+ * binary search. The result of the search may be:
+ * o exact match, then %1 is returned, and the slot number of the branch is
+ * stored in @n;
+ * o no exact match, then %0 is returned and the slot number of the left
+ * closest branch is returned in @n; the slot if all keys in this znode are
+ * greater than @key, then %-1 is returned in @n.
+ */
+int ubifs_search_zbranch(const struct ubifs_info *c,
+ const struct ubifs_znode *znode,
+ const union ubifs_key *key, int *n)
+{
+ int beg = 0, end = znode->child_cnt, uninitialized_var(mid);
+ int uninitialized_var(cmp);
+ const struct ubifs_zbranch *zbr = &znode->zbranch[0];
+
+ ubifs_assert(end > beg);
+
+ while (end > beg) {
+ mid = (beg + end) >> 1;
+ cmp = keys_cmp(c, key, &zbr[mid].key);
+ if (cmp > 0)
+ beg = mid + 1;
+ else if (cmp < 0)
+ end = mid;
+ else {
+ *n = mid;
+ return 1;
+ }
+ }
+
+ *n = end - 1;
+
+ /* The insert point is after *n */
+ ubifs_assert(*n >= -1 && *n < znode->child_cnt);
+ if (*n == -1)
+ ubifs_assert(keys_cmp(c, key, &zbr[0].key) < 0);
+ else
+ ubifs_assert(keys_cmp(c, key, &zbr[*n].key) > 0);
+ if (*n + 1 < znode->child_cnt)
+ ubifs_assert(keys_cmp(c, key, &zbr[*n + 1].key) < 0);
+
+ return 0;
+}
+
+/**
+ * ubifs_tnc_postorder_first - find first znode to do postorder tree traversal.
+ * @znode: znode to start at (root of the sub-tree to traverse)
+ *
+ * Find the lowest leftmost znode in a subtree of the TNC tree. The LNC is
+ * ignored.
+ */
+struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode)
+{
+ if (unlikely(!znode))
+ return NULL;
+
+ while (znode->level > 0) {
+ struct ubifs_znode *child;
+
+ child = ubifs_tnc_find_child(znode, 0);
+ if (!child)
+ return znode;
+ znode = child;
+ }
+
+ return znode;
+}
+
+/**
+ * ubifs_tnc_postorder_next - next TNC tree element in postorder traversal.
+ * @znode: previous znode
+ *
+ * This function implements postorder TNC traversal. The LNC is ignored.
+ * Returns the next element or %NULL if @znode is already the last one.
+ */
+struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode)
+{
+ struct ubifs_znode *zn;
+
+ ubifs_assert(znode);
+ if (unlikely(!znode->parent))
+ return NULL;
+
+ /* Switch to the next index in the parent */
+ zn = ubifs_tnc_find_child(znode->parent, znode->iip + 1);
+ if (!zn)
+ /* This is in fact the last child, return parent */
+ return znode->parent;
+
+ /* Go to the first znode in this new subtree */
+ return ubifs_tnc_postorder_first(zn);
+}
+
+/**
+ * ubifs_destroy_tnc_subtree - destroy all znodes connected to a subtree.
+ * @znode: znode defining subtree to destroy
+ *
+ * This function destroys subtree of the TNC tree. Returns number of clean
+ * znodes in the subtree.
+ */
+long ubifs_destroy_tnc_subtree(struct ubifs_znode *znode)
+{
+ struct ubifs_znode *zn = ubifs_tnc_postorder_first(znode);
+ long clean_freed = 0;
+ int n;
+
+ ubifs_assert(zn);
+ while (1) {
+ for (n = 0; n < zn->child_cnt; n++) {
+ if (!zn->zbranch[n].znode)
+ continue;
+
+ if (zn->level > 0 &&
+ !ubifs_zn_dirty(zn->zbranch[n].znode))
+ clean_freed += 1;
+
+ cond_resched();
+ kfree(zn->zbranch[n].znode);
+ }
+
+ if (zn == znode) {
+ if (!ubifs_zn_dirty(zn))
+ clean_freed += 1;
+ kfree(zn);
+ return clean_freed;
+ }
+
+ zn = ubifs_tnc_postorder_next(zn);
+ }
+}
+
+/**
+ * read_znode - read an indexing node from flash and fill znode.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB of the indexing node to read
+ * @offs: node offset
+ * @len: node length
+ * @znode: znode to read to
+ *
+ * This function reads an indexing node from the flash media and fills znode
+ * with the read data. Returns zero in case of success and a negative error
+ * code in case of failure. The read indexing node is validated and if anything
+ * is wrong with it, this function prints complaint messages and returns
+ * %-EINVAL.
+ */
+static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
+ struct ubifs_znode *znode)
+{
+ int i, err, type, cmp;
+ struct ubifs_idx_node *idx;
+
+ idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
+ if (!idx)
+ return -ENOMEM;
+
+ err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs);
+ if (err < 0) {
+ kfree(idx);
+ return err;
+ }
+
+ znode->child_cnt = le16_to_cpu(idx->child_cnt);
+ znode->level = le16_to_cpu(idx->level);
+
+ dbg_tnc("LEB %d:%d, level %d, %d branch",
+ lnum, offs, znode->level, znode->child_cnt);
+
+ if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) {
+ dbg_err("current fanout %d, branch count %d",
+ c->fanout, znode->child_cnt);
+ dbg_err("max levels %d, znode level %d",
+ UBIFS_MAX_LEVELS, znode->level);
+ err = 1;
+ goto out_dump;
+ }
+
+ for (i = 0; i < znode->child_cnt; i++) {
+ const struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
+ struct ubifs_zbranch *zbr = &znode->zbranch[i];
+
+ key_read(c, &br->key, &zbr->key);
+ zbr->lnum = le32_to_cpu(br->lnum);
+ zbr->offs = le32_to_cpu(br->offs);
+ zbr->len = le32_to_cpu(br->len);
+ zbr->znode = NULL;
+
+ /* Validate branch */
+
+ if (zbr->lnum < c->main_first ||
+ zbr->lnum >= c->leb_cnt || zbr->offs < 0 ||
+ zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) {
+ dbg_err("bad branch %d", i);
+ err = 2;
+ goto out_dump;
+ }
+
+ switch (key_type(c, &zbr->key)) {
+ case UBIFS_INO_KEY:
+ case UBIFS_DATA_KEY:
+ case UBIFS_DENT_KEY:
+ case UBIFS_XENT_KEY:
+ break;
+ default:
+ dbg_msg("bad key type at slot %d: %s", i,
+ DBGKEY(&zbr->key));
+ err = 3;
+ goto out_dump;
+ }
+
+ if (znode->level)
+ continue;
+
+ type = key_type(c, &zbr->key);
+ if (c->ranges[type].max_len == 0) {
+ if (zbr->len != c->ranges[type].len) {
+ dbg_err("bad target node (type %d) length (%d)",
+ type, zbr->len);
+ dbg_err("have to be %d", c->ranges[type].len);
+ err = 4;
+ goto out_dump;
+ }
+ } else if (zbr->len < c->ranges[type].min_len ||
+ zbr->len > c->ranges[type].max_len) {
+ dbg_err("bad target node (type %d) length (%d)",
+ type, zbr->len);
+ dbg_err("have to be in range of %d-%d",
+ c->ranges[type].min_len,
+ c->ranges[type].max_len);
+ err = 5;
+ goto out_dump;
+ }
+ }
+
+ /*
+ * Ensure that the next key is greater or equivalent to the
+ * previous one.
+ */
+ for (i = 0; i < znode->child_cnt - 1; i++) {
+ const union ubifs_key *key1, *key2;
+
+ key1 = &znode->zbranch[i].key;
+ key2 = &znode->zbranch[i + 1].key;
+
+ cmp = keys_cmp(c, key1, key2);
+ if (cmp > 0) {
+ dbg_err("bad key order (keys %d and %d)", i, i + 1);
+ err = 6;
+ goto out_dump;
+ } else if (cmp == 0 && !is_hash_key(c, key1)) {
+ /* These can only be keys with colliding hash */
+ dbg_err("keys %d and %d are not hashed but equivalent",
+ i, i + 1);
+ err = 7;
+ goto out_dump;
+ }
+ }
+
+ kfree(idx);
+ return 0;
+
+out_dump:
+ ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err);
+ dbg_dump_node(c, idx);
+ kfree(idx);
+ return -EINVAL;
+}
+
+/**
+ * ubifs_load_znode - load znode to TNC cache.
+ * @c: UBIFS file-system description object
+ * @zbr: znode branch
+ * @parent: znode's parent
+ * @iip: index in parent
+ *
+ * This function loads znode pointed to by @zbr into the TNC cache and
+ * returns pointer to it in case of success and a negative error code in case
+ * of failure.
+ */
+struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
+ struct ubifs_zbranch *zbr,
+ struct ubifs_znode *parent, int iip)
+{
+ int err;
+ struct ubifs_znode *znode;
+
+ ubifs_assert(!zbr->znode);
+ /*
+ * A slab cache is not presently used for znodes because the znode size
+ * depends on the fanout which is stored in the superblock.
+ */
+ znode = kzalloc(c->max_znode_sz, GFP_NOFS);
+ if (!znode)
+ return ERR_PTR(-ENOMEM);
+
+ err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode);
+ if (err)
+ goto out;
+
+ atomic_long_inc(&c->clean_zn_cnt);
+
+ /*
+ * Increment the global clean znode counter as well. It is OK that
+ * global and per-FS clean znode counters may be inconsistent for some
+ * short time (because we might be preempted at this point), the global
+ * one is only used in shrinker.
+ */
+ atomic_long_inc(&ubifs_clean_zn_cnt);
+
+ zbr->znode = znode;
+ znode->parent = parent;
+ znode->time = get_seconds();
+ znode->iip = iip;
+
+ return znode;
+
+out:
+ kfree(znode);
+ return ERR_PTR(err);
+}
+
+/**
+ * ubifs_tnc_read_node - read a leaf node from the flash media.
+ * @c: UBIFS file-system description object
+ * @zbr: key and position of the node
+ * @node: node is returned here
+ *
+ * This function reads a node defined by @zbr from the flash media. Returns
+ * zero in case of success or a negative negative error code in case of
+ * failure.
+ */
+int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+ void *node)
+{
+ union ubifs_key key1, *key = &zbr->key;
+ int err, type = key_type(c, key);
+ struct ubifs_wbuf *wbuf;
+
+ /*
+ * 'zbr' has to point to on-flash node. The node may sit in a bud and
+ * may even be in a write buffer, so we have to take care about this.
+ */
+ wbuf = ubifs_get_wbuf(c, zbr->lnum);
+ if (wbuf)
+ err = ubifs_read_node_wbuf(wbuf, node, type, zbr->len,
+ zbr->lnum, zbr->offs);
+ else
+ err = ubifs_read_node(c, node, type, zbr->len, zbr->lnum,
+ zbr->offs);
+
+ if (err) {
+ dbg_tnc("key %s", DBGKEY(key));
+ return err;
+ }
+
+ /* Make sure the key of the read node is correct */
+ key_read(c, key, &key1);
+ if (memcmp(node + UBIFS_KEY_OFFSET, &key1, c->key_len)) {
+ ubifs_err("bad key in node at LEB %d:%d",
+ zbr->lnum, zbr->offs);
+ dbg_tnc("looked for key %s found node's key %s",
+ DBGKEY(key), DBGKEY1(&key1));
+ dbg_dump_node(c, node);
+ return -EINVAL;
+ }
+
+ return 0;
+}
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
new file mode 100644
index 00000000000..a9ecbd9af20
--- /dev/null
+++ b/fs/ubifs/ubifs-media.h
@@ -0,0 +1,745 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file describes UBIFS on-flash format and contains definitions of all the
+ * relevant data structures and constants.
+ *
+ * All UBIFS on-flash objects are stored in the form of nodes. All nodes start
+ * with the UBIFS node magic number and have the same common header. Nodes
+ * always sit at 8-byte aligned positions on the media and node header sizes are
+ * also 8-byte aligned (except for the indexing node and the padding node).
+ */
+
+#ifndef __UBIFS_MEDIA_H__
+#define __UBIFS_MEDIA_H__
+
+/* UBIFS node magic number (must not have the padding byte first or last) */
+#define UBIFS_NODE_MAGIC 0x06101831
+
+/* UBIFS on-flash format version */
+#define UBIFS_FORMAT_VERSION 4
+
+/* Minimum logical eraseblock size in bytes */
+#define UBIFS_MIN_LEB_SZ (15*1024)
+
+/* Initial CRC32 value used when calculating CRC checksums */
+#define UBIFS_CRC32_INIT 0xFFFFFFFFU
+
+/*
+ * UBIFS does not try to compress data if its length is less than the below
+ * constant.
+ */
+#define UBIFS_MIN_COMPR_LEN 128
+
+/* Root inode number */
+#define UBIFS_ROOT_INO 1
+
+/* Lowest inode number used for regular inodes (not UBIFS-only internal ones) */
+#define UBIFS_FIRST_INO 64
+
+/*
+ * Maximum file name and extended attribute length (must be a multiple of 8,
+ * minus 1).
+ */
+#define UBIFS_MAX_NLEN 255
+
+/* Maximum number of data journal heads */
+#define UBIFS_MAX_JHEADS 1
+
+/*
+ * Size of UBIFS data block. Note, UBIFS is not a block oriented file-system,
+ * which means that it does not treat the underlying media as consisting of
+ * blocks like in case of hard drives. Do not be confused. UBIFS block is just
+ * the maximum amount of data which one data node can have or which can be
+ * attached to an inode node.
+ */
+#define UBIFS_BLOCK_SIZE 4096
+#define UBIFS_BLOCK_SHIFT 12
+#define UBIFS_BLOCK_MASK 0x00000FFF
+
+/* UBIFS padding byte pattern (must not be first or last byte of node magic) */
+#define UBIFS_PADDING_BYTE 0xCE
+
+/* Maximum possible key length */
+#define UBIFS_MAX_KEY_LEN 16
+
+/* Key length ("simple" format) */
+#define UBIFS_SK_LEN 8
+
+/* Minimum index tree fanout */
+#define UBIFS_MIN_FANOUT 3
+
+/* Maximum number of levels in UBIFS indexing B-tree */
+#define UBIFS_MAX_LEVELS 512
+
+/* Maximum amount of data attached to an inode in bytes */
+#define UBIFS_MAX_INO_DATA UBIFS_BLOCK_SIZE
+
+/* LEB Properties Tree fanout (must be power of 2) and fanout shift */
+#define UBIFS_LPT_FANOUT 4
+#define UBIFS_LPT_FANOUT_SHIFT 2
+
+/* LEB Properties Tree bit field sizes */
+#define UBIFS_LPT_CRC_BITS 16
+#define UBIFS_LPT_CRC_BYTES 2
+#define UBIFS_LPT_TYPE_BITS 4
+
+/* The key is always at the same position in all keyed nodes */
+#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key)
+
+/*
+ * LEB Properties Tree node types.
+ *
+ * UBIFS_LPT_PNODE: LPT leaf node (contains LEB properties)
+ * UBIFS_LPT_NNODE: LPT internal node
+ * UBIFS_LPT_LTAB: LPT's own lprops table
+ * UBIFS_LPT_LSAVE: LPT's save table (big model only)
+ * UBIFS_LPT_NODE_CNT: count of LPT node types
+ * UBIFS_LPT_NOT_A_NODE: all ones (15 for 4 bits) is never a valid node type
+ */
+enum {
+ UBIFS_LPT_PNODE,
+ UBIFS_LPT_NNODE,
+ UBIFS_LPT_LTAB,
+ UBIFS_LPT_LSAVE,
+ UBIFS_LPT_NODE_CNT,
+ UBIFS_LPT_NOT_A_NODE = (1 << UBIFS_LPT_TYPE_BITS) - 1,
+};
+
+/*
+ * UBIFS inode types.
+ *
+ * UBIFS_ITYPE_REG: regular file
+ * UBIFS_ITYPE_DIR: directory
+ * UBIFS_ITYPE_LNK: soft link
+ * UBIFS_ITYPE_BLK: block device node
+ * UBIFS_ITYPE_CHR: character device node
+ * UBIFS_ITYPE_FIFO: fifo
+ * UBIFS_ITYPE_SOCK: socket
+ * UBIFS_ITYPES_CNT: count of supported file types
+ */
+enum {
+ UBIFS_ITYPE_REG,
+ UBIFS_ITYPE_DIR,
+ UBIFS_ITYPE_LNK,
+ UBIFS_ITYPE_BLK,
+ UBIFS_ITYPE_CHR,
+ UBIFS_ITYPE_FIFO,
+ UBIFS_ITYPE_SOCK,
+ UBIFS_ITYPES_CNT,
+};
+
+/*
+ * Supported key hash functions.
+ *
+ * UBIFS_KEY_HASH_R5: R5 hash
+ * UBIFS_KEY_HASH_TEST: test hash which just returns first 4 bytes of the name
+ */
+enum {
+ UBIFS_KEY_HASH_R5,
+ UBIFS_KEY_HASH_TEST,
+};
+
+/*
+ * Supported key formats.
+ *
+ * UBIFS_SIMPLE_KEY_FMT: simple key format
+ */
+enum {
+ UBIFS_SIMPLE_KEY_FMT,
+};
+
+/*
+ * The simple key format uses 29 bits for storing UBIFS block number and hash
+ * value.
+ */
+#define UBIFS_S_KEY_BLOCK_BITS 29
+#define UBIFS_S_KEY_BLOCK_MASK 0x1FFFFFFF
+#define UBIFS_S_KEY_HASH_BITS UBIFS_S_KEY_BLOCK_BITS
+#define UBIFS_S_KEY_HASH_MASK UBIFS_S_KEY_BLOCK_MASK
+
+/*
+ * Key types.
+ *
+ * UBIFS_INO_KEY: inode node key
+ * UBIFS_DATA_KEY: data node key
+ * UBIFS_DENT_KEY: directory entry node key
+ * UBIFS_XENT_KEY: extended attribute entry key
+ * UBIFS_KEY_TYPES_CNT: number of supported key types
+ */
+enum {
+ UBIFS_INO_KEY,
+ UBIFS_DATA_KEY,
+ UBIFS_DENT_KEY,
+ UBIFS_XENT_KEY,
+ UBIFS_KEY_TYPES_CNT,
+};
+
+/* Count of LEBs reserved for the superblock area */
+#define UBIFS_SB_LEBS 1
+/* Count of LEBs reserved for the master area */
+#define UBIFS_MST_LEBS 2
+
+/* First LEB of the superblock area */
+#define UBIFS_SB_LNUM 0
+/* First LEB of the master area */
+#define UBIFS_MST_LNUM (UBIFS_SB_LNUM + UBIFS_SB_LEBS)
+/* First LEB of the log area */
+#define UBIFS_LOG_LNUM (UBIFS_MST_LNUM + UBIFS_MST_LEBS)
+
+/*
+ * The below constants define the absolute minimum values for various UBIFS
+ * media areas. Many of them actually depend of flash geometry and the FS
+ * configuration (number of journal heads, orphan LEBs, etc). This means that
+ * the smallest volume size which can be used for UBIFS cannot be pre-defined
+ * by these constants. The file-system that meets the below limitation will not
+ * necessarily mount. UBIFS does run-time calculations and validates the FS
+ * size.
+ */
+
+/* Minimum number of logical eraseblocks in the log */
+#define UBIFS_MIN_LOG_LEBS 2
+/* Minimum number of bud logical eraseblocks (one for each head) */
+#define UBIFS_MIN_BUD_LEBS 3
+/* Minimum number of journal logical eraseblocks */
+#define UBIFS_MIN_JNL_LEBS (UBIFS_MIN_LOG_LEBS + UBIFS_MIN_BUD_LEBS)
+/* Minimum number of LPT area logical eraseblocks */
+#define UBIFS_MIN_LPT_LEBS 2
+/* Minimum number of orphan area logical eraseblocks */
+#define UBIFS_MIN_ORPH_LEBS 1
+/*
+ * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1
+ * for GC, 1 for deletions, and at least 1 for committed data).
+ */
+#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6)
+
+/* Minimum number of logical eraseblocks */
+#define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \
+ UBIFS_MIN_LOG_LEBS + UBIFS_MIN_LPT_LEBS + \
+ UBIFS_MIN_ORPH_LEBS + UBIFS_MIN_MAIN_LEBS)
+
+/* Node sizes (N.B. these are guaranteed to be multiples of 8) */
+#define UBIFS_CH_SZ sizeof(struct ubifs_ch)
+#define UBIFS_INO_NODE_SZ sizeof(struct ubifs_ino_node)
+#define UBIFS_DATA_NODE_SZ sizeof(struct ubifs_data_node)
+#define UBIFS_DENT_NODE_SZ sizeof(struct ubifs_dent_node)
+#define UBIFS_TRUN_NODE_SZ sizeof(struct ubifs_trun_node)
+#define UBIFS_PAD_NODE_SZ sizeof(struct ubifs_pad_node)
+#define UBIFS_SB_NODE_SZ sizeof(struct ubifs_sb_node)
+#define UBIFS_MST_NODE_SZ sizeof(struct ubifs_mst_node)
+#define UBIFS_REF_NODE_SZ sizeof(struct ubifs_ref_node)
+#define UBIFS_IDX_NODE_SZ sizeof(struct ubifs_idx_node)
+#define UBIFS_CS_NODE_SZ sizeof(struct ubifs_cs_node)
+#define UBIFS_ORPH_NODE_SZ sizeof(struct ubifs_orph_node)
+/* Extended attribute entry nodes are identical to directory entry nodes */
+#define UBIFS_XENT_NODE_SZ UBIFS_DENT_NODE_SZ
+/* Only this does not have to be multiple of 8 bytes */
+#define UBIFS_BRANCH_SZ sizeof(struct ubifs_branch)
+
+/* Maximum node sizes (N.B. these are guaranteed to be multiples of 8) */
+#define UBIFS_MAX_DATA_NODE_SZ (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE)
+#define UBIFS_MAX_INO_NODE_SZ (UBIFS_INO_NODE_SZ + UBIFS_MAX_INO_DATA)
+#define UBIFS_MAX_DENT_NODE_SZ (UBIFS_DENT_NODE_SZ + UBIFS_MAX_NLEN + 1)
+#define UBIFS_MAX_XENT_NODE_SZ UBIFS_MAX_DENT_NODE_SZ
+
+/* The largest UBIFS node */
+#define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ
+
+/*
+ * On-flash inode flags.
+ *
+ * UBIFS_COMPR_FL: use compression for this inode
+ * UBIFS_SYNC_FL: I/O on this inode has to be synchronous
+ * UBIFS_IMMUTABLE_FL: inode is immutable
+ * UBIFS_APPEND_FL: writes to the inode may only append data
+ * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous
+ * UBIFS_XATTR_FL: this inode is the inode for an extended attribute value
+ *
+ * Note, these are on-flash flags which correspond to ioctl flags
+ * (@FS_COMPR_FL, etc). They have the same values now, but generally, do not
+ * have to be the same.
+ */
+enum {
+ UBIFS_COMPR_FL = 0x01,
+ UBIFS_SYNC_FL = 0x02,
+ UBIFS_IMMUTABLE_FL = 0x04,
+ UBIFS_APPEND_FL = 0x08,
+ UBIFS_DIRSYNC_FL = 0x10,
+ UBIFS_XATTR_FL = 0x20,
+};
+
+/* Inode flag bits used by UBIFS */
+#define UBIFS_FL_MASK 0x0000001F
+
+/*
+ * UBIFS compression algorithms.
+ *
+ * UBIFS_COMPR_NONE: no compression
+ * UBIFS_COMPR_LZO: LZO compression
+ * UBIFS_COMPR_ZLIB: ZLIB compression
+ * UBIFS_COMPR_TYPES_CNT: count of supported compression types
+ */
+enum {
+ UBIFS_COMPR_NONE,
+ UBIFS_COMPR_LZO,
+ UBIFS_COMPR_ZLIB,
+ UBIFS_COMPR_TYPES_CNT,
+};
+
+/*
+ * UBIFS node types.
+ *
+ * UBIFS_INO_NODE: inode node
+ * UBIFS_DATA_NODE: data node
+ * UBIFS_DENT_NODE: directory entry node
+ * UBIFS_XENT_NODE: extended attribute node
+ * UBIFS_TRUN_NODE: truncation node
+ * UBIFS_PAD_NODE: padding node
+ * UBIFS_SB_NODE: superblock node
+ * UBIFS_MST_NODE: master node
+ * UBIFS_REF_NODE: LEB reference node
+ * UBIFS_IDX_NODE: index node
+ * UBIFS_CS_NODE: commit start node
+ * UBIFS_ORPH_NODE: orphan node
+ * UBIFS_NODE_TYPES_CNT: count of supported node types
+ *
+ * Note, we index arrays by these numbers, so keep them low and contiguous.
+ * Node type constants for inodes, direntries and so on have to be the same as
+ * corresponding key type constants.
+ */
+enum {
+ UBIFS_INO_NODE,
+ UBIFS_DATA_NODE,
+ UBIFS_DENT_NODE,
+ UBIFS_XENT_NODE,
+ UBIFS_TRUN_NODE,
+ UBIFS_PAD_NODE,
+ UBIFS_SB_NODE,
+ UBIFS_MST_NODE,
+ UBIFS_REF_NODE,
+ UBIFS_IDX_NODE,
+ UBIFS_CS_NODE,
+ UBIFS_ORPH_NODE,
+ UBIFS_NODE_TYPES_CNT,
+};
+
+/*
+ * Master node flags.
+ *
+ * UBIFS_MST_DIRTY: rebooted uncleanly - master node is dirty
+ * UBIFS_MST_NO_ORPHS: no orphan inodes present
+ * UBIFS_MST_RCVRY: written by recovery
+ */
+enum {
+ UBIFS_MST_DIRTY = 1,
+ UBIFS_MST_NO_ORPHS = 2,
+ UBIFS_MST_RCVRY = 4,
+};
+
+/*
+ * Node group type (used by recovery to recover whole group or none).
+ *
+ * UBIFS_NO_NODE_GROUP: this node is not part of a group
+ * UBIFS_IN_NODE_GROUP: this node is a part of a group
+ * UBIFS_LAST_OF_NODE_GROUP: this node is the last in a group
+ */
+enum {
+ UBIFS_NO_NODE_GROUP = 0,
+ UBIFS_IN_NODE_GROUP,
+ UBIFS_LAST_OF_NODE_GROUP,
+};
+
+/*
+ * Superblock flags.
+ *
+ * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set
+ */
+enum {
+ UBIFS_FLG_BIGLPT = 0x02,
+};
+
+/**
+ * struct ubifs_ch - common header node.
+ * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC)
+ * @crc: CRC-32 checksum of the node header
+ * @sqnum: sequence number
+ * @len: full node length
+ * @node_type: node type
+ * @group_type: node group type
+ * @padding: reserved for future, zeroes
+ *
+ * Every UBIFS node starts with this common part. If the node has a key, the
+ * key always goes next.
+ */
+struct ubifs_ch {
+ __le32 magic;
+ __le32 crc;
+ __le64 sqnum;
+ __le32 len;
+ __u8 node_type;
+ __u8 group_type;
+ __u8 padding[2];
+} __attribute__ ((packed));
+
+/**
+ * union ubifs_dev_desc - device node descriptor.
+ * @new: new type device descriptor
+ * @huge: huge type device descriptor
+ *
+ * This data structure describes major/minor numbers of a device node. In an
+ * inode is a device node then its data contains an object of this type. UBIFS
+ * uses standard Linux "new" and "huge" device node encodings.
+ */
+union ubifs_dev_desc {
+ __le32 new;
+ __le64 huge;
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_ino_node - inode node.
+ * @ch: common header
+ * @key: node key
+ * @creat_sqnum: sequence number at time of creation
+ * @size: inode size in bytes (amount of uncompressed data)
+ * @atime_sec: access time seconds
+ * @ctime_sec: creation time seconds
+ * @mtime_sec: modification time seconds
+ * @atime_nsec: access time nanoseconds
+ * @ctime_nsec: creation time nanoseconds
+ * @mtime_nsec: modification time nanoseconds
+ * @nlink: number of hard links
+ * @uid: owner ID
+ * @gid: group ID
+ * @mode: access flags
+ * @flags: per-inode flags (%UBIFS_COMPR_FL, %UBIFS_SYNC_FL, etc)
+ * @data_len: inode data length
+ * @xattr_cnt: count of extended attributes this inode has
+ * @xattr_size: summarized size of all extended attributes in bytes
+ * @padding1: reserved for future, zeroes
+ * @xattr_names: sum of lengths of all extended attribute names belonging to
+ * this inode
+ * @compr_type: compression type used for this inode
+ * @padding2: reserved for future, zeroes
+ * @data: data attached to the inode
+ *
+ * Note, even though inode compression type is defined by @compr_type, some
+ * nodes of this inode may be compressed with different compressor - this
+ * happens if compression type is changed while the inode already has data
+ * nodes. But @compr_type will be use for further writes to the inode.
+ *
+ * Note, do not forget to amend 'zero_ino_node_unused()' function when changing
+ * the padding fields.
+ */
+struct ubifs_ino_node {
+ struct ubifs_ch ch;
+ __u8 key[UBIFS_MAX_KEY_LEN];
+ __le64 creat_sqnum;
+ __le64 size;
+ __le64 atime_sec;
+ __le64 ctime_sec;
+ __le64 mtime_sec;
+ __le32 atime_nsec;
+ __le32 ctime_nsec;
+ __le32 mtime_nsec;
+ __le32 nlink;
+ __le32 uid;
+ __le32 gid;
+ __le32 mode;
+ __le32 flags;
+ __le32 data_len;
+ __le32 xattr_cnt;
+ __le32 xattr_size;
+ __u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */
+ __le32 xattr_names;
+ __le16 compr_type;
+ __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */
+ __u8 data[];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_dent_node - directory entry node.
+ * @ch: common header
+ * @key: node key
+ * @inum: target inode number
+ * @padding1: reserved for future, zeroes
+ * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc)
+ * @nlen: name length
+ * @padding2: reserved for future, zeroes
+ * @name: zero-terminated name
+ *
+ * Note, do not forget to amend 'zero_dent_node_unused()' function when
+ * changing the padding fields.
+ */
+struct ubifs_dent_node {
+ struct ubifs_ch ch;
+ __u8 key[UBIFS_MAX_KEY_LEN];
+ __le64 inum;
+ __u8 padding1;
+ __u8 type;
+ __le16 nlen;
+ __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */
+ __u8 name[];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_data_node - data node.
+ * @ch: common header
+ * @key: node key
+ * @size: uncompressed data size in bytes
+ * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc)
+ * @padding: reserved for future, zeroes
+ * @data: data
+ *
+ * Note, do not forget to amend 'zero_data_node_unused()' function when
+ * changing the padding fields.
+ */
+struct ubifs_data_node {
+ struct ubifs_ch ch;
+ __u8 key[UBIFS_MAX_KEY_LEN];
+ __le32 size;
+ __le16 compr_type;
+ __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */
+ __u8 data[];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_trun_node - truncation node.
+ * @ch: common header
+ * @inum: truncated inode number
+ * @padding: reserved for future, zeroes
+ * @old_size: size before truncation
+ * @new_size: size after truncation
+ *
+ * This node exists only in the journal and never goes to the main area. Note,
+ * do not forget to amend 'zero_trun_node_unused()' function when changing the
+ * padding fields.
+ */
+struct ubifs_trun_node {
+ struct ubifs_ch ch;
+ __le32 inum;
+ __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */
+ __le64 old_size;
+ __le64 new_size;
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_pad_node - padding node.
+ * @ch: common header
+ * @pad_len: how many bytes after this node are unused (because padded)
+ * @padding: reserved for future, zeroes
+ */
+struct ubifs_pad_node {
+ struct ubifs_ch ch;
+ __le32 pad_len;
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_sb_node - superblock node.
+ * @ch: common header
+ * @padding: reserved for future, zeroes
+ * @key_hash: type of hash function used in keys
+ * @key_fmt: format of the key
+ * @flags: file-system flags (%UBIFS_FLG_BIGLPT, etc)
+ * @min_io_size: minimal input/output unit size
+ * @leb_size: logical eraseblock size in bytes
+ * @leb_cnt: count of LEBs used by file-system
+ * @max_leb_cnt: maximum count of LEBs used by file-system
+ * @max_bud_bytes: maximum amount of data stored in buds
+ * @log_lebs: log size in logical eraseblocks
+ * @lpt_lebs: number of LEBs used for lprops table
+ * @orph_lebs: number of LEBs used for recording orphans
+ * @jhead_cnt: count of journal heads
+ * @fanout: tree fanout (max. number of links per indexing node)
+ * @lsave_cnt: number of LEB numbers in LPT's save table
+ * @fmt_version: UBIFS on-flash format version
+ * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
+ * @padding1: reserved for future, zeroes
+ * @rp_uid: reserve pool UID
+ * @rp_gid: reserve pool GID
+ * @rp_size: size of the reserved pool in bytes
+ * @padding2: reserved for future, zeroes
+ * @time_gran: time granularity in nanoseconds
+ * @uuid: UUID generated when the file system image was created
+ */
+struct ubifs_sb_node {
+ struct ubifs_ch ch;
+ __u8 padding[2];
+ __u8 key_hash;
+ __u8 key_fmt;
+ __le32 flags;
+ __le32 min_io_size;
+ __le32 leb_size;
+ __le32 leb_cnt;
+ __le32 max_leb_cnt;
+ __le64 max_bud_bytes;
+ __le32 log_lebs;
+ __le32 lpt_lebs;
+ __le32 orph_lebs;
+ __le32 jhead_cnt;
+ __le32 fanout;
+ __le32 lsave_cnt;
+ __le32 fmt_version;
+ __le16 default_compr;
+ __u8 padding1[2];
+ __le32 rp_uid;
+ __le32 rp_gid;
+ __le64 rp_size;
+ __le32 time_gran;
+ __u8 uuid[16];
+ __u8 padding2[3972];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_mst_node - master node.
+ * @ch: common header
+ * @highest_inum: highest inode number in the committed index
+ * @cmt_no: commit number
+ * @flags: various flags (%UBIFS_MST_DIRTY, etc)
+ * @log_lnum: start of the log
+ * @root_lnum: LEB number of the root indexing node
+ * @root_offs: offset within @root_lnum
+ * @root_len: root indexing node length
+ * @gc_lnum: LEB reserved for garbage collection (%-1 value means the LEB was
+ * not reserved and should be reserved on mount)
+ * @ihead_lnum: LEB number of index head
+ * @ihead_offs: offset of index head
+ * @index_size: size of index on flash
+ * @total_free: total free space in bytes
+ * @total_dirty: total dirty space in bytes
+ * @total_used: total used space in bytes (includes only data LEBs)
+ * @total_dead: total dead space in bytes (includes only data LEBs)
+ * @total_dark: total dark space in bytes (includes only data LEBs)
+ * @lpt_lnum: LEB number of LPT root nnode
+ * @lpt_offs: offset of LPT root nnode
+ * @nhead_lnum: LEB number of LPT head
+ * @nhead_offs: offset of LPT head
+ * @ltab_lnum: LEB number of LPT's own lprops table
+ * @ltab_offs: offset of LPT's own lprops table
+ * @lsave_lnum: LEB number of LPT's save table (big model only)
+ * @lsave_offs: offset of LPT's save table (big model only)
+ * @lscan_lnum: LEB number of last LPT scan
+ * @empty_lebs: number of empty logical eraseblocks
+ * @idx_lebs: number of indexing logical eraseblocks
+ * @leb_cnt: count of LEBs used by file-system
+ * @padding: reserved for future, zeroes
+ */
+struct ubifs_mst_node {
+ struct ubifs_ch ch;
+ __le64 highest_inum;
+ __le64 cmt_no;
+ __le32 flags;
+ __le32 log_lnum;
+ __le32 root_lnum;
+ __le32 root_offs;
+ __le32 root_len;
+ __le32 gc_lnum;
+ __le32 ihead_lnum;
+ __le32 ihead_offs;
+ __le64 index_size;
+ __le64 total_free;
+ __le64 total_dirty;
+ __le64 total_used;
+ __le64 total_dead;
+ __le64 total_dark;
+ __le32 lpt_lnum;
+ __le32 lpt_offs;
+ __le32 nhead_lnum;
+ __le32 nhead_offs;
+ __le32 ltab_lnum;
+ __le32 ltab_offs;
+ __le32 lsave_lnum;
+ __le32 lsave_offs;
+ __le32 lscan_lnum;
+ __le32 empty_lebs;
+ __le32 idx_lebs;
+ __le32 leb_cnt;
+ __u8 padding[344];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_ref_node - logical eraseblock reference node.
+ * @ch: common header
+ * @lnum: the referred logical eraseblock number
+ * @offs: start offset in the referred LEB
+ * @jhead: journal head number
+ * @padding: reserved for future, zeroes
+ */
+struct ubifs_ref_node {
+ struct ubifs_ch ch;
+ __le32 lnum;
+ __le32 offs;
+ __le32 jhead;
+ __u8 padding[28];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_branch - key/reference/length branch
+ * @lnum: LEB number of the target node
+ * @offs: offset within @lnum
+ * @len: target node length
+ * @key: key
+ */
+struct ubifs_branch {
+ __le32 lnum;
+ __le32 offs;
+ __le32 len;
+ __u8 key[];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_idx_node - indexing node.
+ * @ch: common header
+ * @child_cnt: number of child index nodes
+ * @level: tree level
+ * @branches: LEB number / offset / length / key branches
+ */
+struct ubifs_idx_node {
+ struct ubifs_ch ch;
+ __le16 child_cnt;
+ __le16 level;
+ __u8 branches[];
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_cs_node - commit start node.
+ * @ch: common header
+ * @cmt_no: commit number
+ */
+struct ubifs_cs_node {
+ struct ubifs_ch ch;
+ __le64 cmt_no;
+} __attribute__ ((packed));
+
+/**
+ * struct ubifs_orph_node - orphan node.
+ * @ch: common header
+ * @cmt_no: commit number (also top bit is set on the last node of the commit)
+ * @inos: inode numbers of orphans
+ */
+struct ubifs_orph_node {
+ struct ubifs_ch ch;
+ __le64 cmt_no;
+ __le64 inos[];
+} __attribute__ ((packed));
+
+#endif /* __UBIFS_MEDIA_H__ */
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
new file mode 100644
index 00000000000..17c620b93ee
--- /dev/null
+++ b/fs/ubifs/ubifs.h
@@ -0,0 +1,1668 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+#ifndef __UBIFS_H__
+#define __UBIFS_H__
+
+#include <asm/div64.h>
+#include <linux/statfs.h>
+#include <linux/fs.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/mtd/ubi.h>
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include "ubifs-media.h"
+
+/* Version of this UBIFS implementation */
+#define UBIFS_VERSION 1
+
+/* Normal UBIFS messages */
+#define ubifs_msg(fmt, ...) \
+ printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__)
+/* UBIFS error messages */
+#define ubifs_err(fmt, ...) \
+ printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \
+ __func__, ##__VA_ARGS__)
+/* UBIFS warning messages */
+#define ubifs_warn(fmt, ...) \
+ printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \
+ current->pid, __func__, ##__VA_ARGS__)
+
+/* UBIFS file system VFS magic number */
+#define UBIFS_SUPER_MAGIC 0x24051905
+
+/* Number of UBIFS blocks per VFS page */
+#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE)
+#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT)
+
+/* "File system end of life" sequence number watermark */
+#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
+#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL
+
+/* Minimum amount of data UBIFS writes to the flash */
+#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8)
+
+/*
+ * Currently we do not support inode number overlapping and re-using, so this
+ * watermark defines dangerous inode number level. This should be fixed later,
+ * although it is difficult to exceed current limit. Another option is to use
+ * 64-bit inode numbers, but this means more overhead.
+ */
+#define INUM_WARN_WATERMARK 0xFFF00000
+#define INUM_WATERMARK 0xFFFFFF00
+
+/* Largest key size supported in this implementation */
+#define CUR_MAX_KEY_LEN UBIFS_SK_LEN
+
+/* Maximum number of entries in each LPT (LEB category) heap */
+#define LPT_HEAP_SZ 256
+
+/*
+ * Background thread name pattern. The numbers are UBI device and volume
+ * numbers.
+ */
+#define BGT_NAME_PATTERN "ubifs_bgt%d_%d"
+
+/* Default write-buffer synchronization timeout (5 secs) */
+#define DEFAULT_WBUF_TIMEOUT (5 * HZ)
+
+/* Maximum possible inode number (only 32-bit inodes are supported now) */
+#define MAX_INUM 0xFFFFFFFF
+
+/* Number of non-data journal heads */
+#define NONDATA_JHEADS_CNT 2
+
+/* Garbage collector head */
+#define GCHD 0
+/* Base journal head number */
+#define BASEHD 1
+/* First "general purpose" journal head */
+#define DATAHD 2
+
+/* 'No change' value for 'ubifs_change_lp()' */
+#define LPROPS_NC 0x80000001
+
+/*
+ * There is no notion of truncation key because truncation nodes do not exist
+ * in TNC. However, when replaying, it is handy to introduce fake "truncation"
+ * keys for truncation nodes because the code becomes simpler. So we define
+ * %UBIFS_TRUN_KEY type.
+ */
+#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT
+
+/*
+ * How much a directory entry/extended attribute entry adds to the parent/host
+ * inode.
+ */
+#define CALC_DENT_SIZE(name_len) ALIGN(UBIFS_DENT_NODE_SZ + (name_len) + 1, 8)
+
+/* How much an extended attribute adds to the host inode */
+#define CALC_XATTR_BYTES(data_len) ALIGN(UBIFS_INO_NODE_SZ + (data_len) + 1, 8)
+
+/*
+ * Znodes which were not touched for 'OLD_ZNODE_AGE' seconds are considered
+ * "old", and znode which were touched last 'YOUNG_ZNODE_AGE' seconds ago are
+ * considered "young". This is used by shrinker when selecting znode to trim
+ * off.
+ */
+#define OLD_ZNODE_AGE 20
+#define YOUNG_ZNODE_AGE 5
+
+/*
+ * Some compressors, like LZO, may end up with more data then the input buffer.
+ * So UBIFS always allocates larger output buffer, to be sure the compressor
+ * will not corrupt memory in case of worst case compression.
+ */
+#define WORST_COMPR_FACTOR 2
+
+/* Maximum expected tree height for use by bottom_up_buf */
+#define BOTTOM_UP_HEIGHT 64
+
+/*
+ * Lockdep classes for UBIFS inode @ui_mutex.
+ */
+enum {
+ WB_MUTEX_1 = 0,
+ WB_MUTEX_2 = 1,
+ WB_MUTEX_3 = 2,
+};
+
+/*
+ * Znode flags (actually, bit numbers which store the flags).
+ *
+ * DIRTY_ZNODE: znode is dirty
+ * COW_ZNODE: znode is being committed and a new instance of this znode has to
+ * be created before changing this znode
+ * OBSOLETE_ZNODE: znode is obsolete, which means it was deleted, but it is
+ * still in the commit list and the ongoing commit operation
+ * will commit it, and delete this znode after it is done
+ */
+enum {
+ DIRTY_ZNODE = 0,
+ COW_ZNODE = 1,
+ OBSOLETE_ZNODE = 2,
+};
+
+/*
+ * Commit states.
+ *
+ * COMMIT_RESTING: commit is not wanted
+ * COMMIT_BACKGROUND: background commit has been requested
+ * COMMIT_REQUIRED: commit is required
+ * COMMIT_RUNNING_BACKGROUND: background commit is running
+ * COMMIT_RUNNING_REQUIRED: commit is running and it is required
+ * COMMIT_BROKEN: commit failed
+ */
+enum {
+ COMMIT_RESTING = 0,
+ COMMIT_BACKGROUND,
+ COMMIT_REQUIRED,
+ COMMIT_RUNNING_BACKGROUND,
+ COMMIT_RUNNING_REQUIRED,
+ COMMIT_BROKEN,
+};
+
+/*
+ * 'ubifs_scan_a_node()' return values.
+ *
+ * SCANNED_GARBAGE: scanned garbage
+ * SCANNED_EMPTY_SPACE: scanned empty space
+ * SCANNED_A_NODE: scanned a valid node
+ * SCANNED_A_CORRUPT_NODE: scanned a corrupted node
+ * SCANNED_A_BAD_PAD_NODE: scanned a padding node with invalid pad length
+ *
+ * Greater than zero means: 'scanned that number of padding bytes'
+ */
+enum {
+ SCANNED_GARBAGE = 0,
+ SCANNED_EMPTY_SPACE = -1,
+ SCANNED_A_NODE = -2,
+ SCANNED_A_CORRUPT_NODE = -3,
+ SCANNED_A_BAD_PAD_NODE = -4,
+};
+
+/*
+ * LPT cnode flag bits.
+ *
+ * DIRTY_CNODE: cnode is dirty
+ * COW_CNODE: cnode is being committed and must be copied before writing
+ * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted),
+ * so it can (and must) be freed when the commit is finished
+ */
+enum {
+ DIRTY_CNODE = 0,
+ COW_CNODE = 1,
+ OBSOLETE_CNODE = 2,
+};
+
+/*
+ * Dirty flag bits (lpt_drty_flgs) for LPT special nodes.
+ *
+ * LTAB_DIRTY: ltab node is dirty
+ * LSAVE_DIRTY: lsave node is dirty
+ */
+enum {
+ LTAB_DIRTY = 1,
+ LSAVE_DIRTY = 2,
+};
+
+/*
+ * Return codes used by the garbage collector.
+ * @LEB_FREED: the logical eraseblock was freed and is ready to use
+ * @LEB_FREED_IDX: indexing LEB was freed and can be used only after the commit
+ * @LEB_RETAINED: the logical eraseblock was freed and retained for GC purposes
+ */
+enum {
+ LEB_FREED,
+ LEB_FREED_IDX,
+ LEB_RETAINED,
+};
+
+/**
+ * struct ubifs_old_idx - index node obsoleted since last commit start.
+ * @rb: rb-tree node
+ * @lnum: LEB number of obsoleted index node
+ * @offs: offset of obsoleted index node
+ */
+struct ubifs_old_idx {
+ struct rb_node rb;
+ int lnum;
+ int offs;
+};
+
+/* The below union makes it easier to deal with keys */
+union ubifs_key {
+ uint8_t u8[CUR_MAX_KEY_LEN];
+ uint32_t u32[CUR_MAX_KEY_LEN/4];
+ uint64_t u64[CUR_MAX_KEY_LEN/8];
+ __le32 j32[CUR_MAX_KEY_LEN/4];
+};
+
+/**
+ * struct ubifs_scan_node - UBIFS scanned node information.
+ * @list: list of scanned nodes
+ * @key: key of node scanned (if it has one)
+ * @sqnum: sequence number
+ * @type: type of node scanned
+ * @offs: offset with LEB of node scanned
+ * @len: length of node scanned
+ * @node: raw node
+ */
+struct ubifs_scan_node {
+ struct list_head list;
+ union ubifs_key key;
+ unsigned long long sqnum;
+ int type;
+ int offs;
+ int len;
+ void *node;
+};
+
+/**
+ * struct ubifs_scan_leb - UBIFS scanned LEB information.
+ * @lnum: logical eraseblock number
+ * @nodes_cnt: number of nodes scanned
+ * @nodes: list of struct ubifs_scan_node
+ * @endpt: end point (and therefore the start of empty space)
+ * @ecc: read returned -EBADMSG
+ * @buf: buffer containing entire LEB scanned
+ */
+struct ubifs_scan_leb {
+ int lnum;
+ int nodes_cnt;
+ struct list_head nodes;
+ int endpt;
+ int ecc;
+ void *buf;
+};
+
+/**
+ * struct ubifs_gced_idx_leb - garbage-collected indexing LEB.
+ * @list: list
+ * @lnum: LEB number
+ * @unmap: OK to unmap this LEB
+ *
+ * This data structure is used to temporary store garbage-collected indexing
+ * LEBs - they are not released immediately, but only after the next commit.
+ * This is needed to guarantee recoverability.
+ */
+struct ubifs_gced_idx_leb {
+ struct list_head list;
+ int lnum;
+ int unmap;
+};
+
+/**
+ * struct ubifs_inode - UBIFS in-memory inode description.
+ * @vfs_inode: VFS inode description object
+ * @creat_sqnum: sequence number at time of creation
+ * @del_cmtno: commit number corresponding to the time the inode was deleted,
+ * protected by @c->commit_sem;
+ * @xattr_size: summarized size of all extended attributes in bytes
+ * @xattr_cnt: count of extended attributes this inode has
+ * @xattr_names: sum of lengths of all extended attribute names belonging to
+ * this inode
+ * @dirty: non-zero if the inode is dirty
+ * @xattr: non-zero if this is an extended attribute inode
+ * @ui_mutex: serializes inode write-back with the rest of VFS operations,
+ * serializes "clean <-> dirty" state changes, protects @dirty,
+ * @ui_size, and @xattr_size
+ * @ui_lock: protects @synced_i_size
+ * @synced_i_size: synchronized size of inode, i.e. the value of inode size
+ * currently stored on the flash; used only for regular file
+ * inodes
+ * @ui_size: inode size used by UBIFS when writing to flash
+ * @flags: inode flags (@UBIFS_COMPR_FL, etc)
+ * @compr_type: default compression type used for this inode
+ * @data_len: length of the data attached to the inode
+ * @data: inode's data
+ *
+ * @ui_mutex exists for two main reasons. At first it prevents inodes from
+ * being written back while UBIFS changing them, being in the middle of an VFS
+ * operation. This way UBIFS makes sure the inode fields are consistent. For
+ * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and
+ * write-back must not write any of them before we have finished.
+ *
+ * The second reason is budgeting - UBIFS has to budget all operations. If an
+ * operation is going to mark an inode dirty, it has to allocate budget for
+ * this. It cannot just mark it dirty because there is no guarantee there will
+ * be enough flash space to write the inode back later. This means UBIFS has
+ * to have full control over inode "clean <-> dirty" transitions (and pages
+ * actually). But unfortunately, VFS marks inodes dirty in many places, and it
+ * does not ask the file-system if it is allowed to do so (there is a notifier,
+ * but it is not enough), i.e., there is no mechanism to synchronize with this.
+ * So UBIFS has its own inode dirty flag and its own mutex to serialize
+ * "clean <-> dirty" transitions.
+ *
+ * The @synced_i_size field is used to make sure we never write pages which are
+ * beyond last synchronized inode size. See 'ubifs_writepage()' for more
+ * information.
+ *
+ * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
+ * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
+ * make sure @inode->i_size is always changed under @ui_mutex, because it
+ * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock
+ * with 'ubifs_writepage()' (see file.c). All the other inode fields are
+ * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
+ * could consider to rework locking and base it on "shadow" fields.
+ */
+struct ubifs_inode {
+ struct inode vfs_inode;
+ unsigned long long creat_sqnum;
+ unsigned long long del_cmtno;
+ unsigned int xattr_size;
+ unsigned int xattr_cnt;
+ unsigned int xattr_names;
+ unsigned int dirty:1;
+ unsigned int xattr:1;
+ struct mutex ui_mutex;
+ spinlock_t ui_lock;
+ loff_t synced_i_size;
+ loff_t ui_size;
+ int flags;
+ int compr_type;
+ int data_len;
+ void *data;
+};
+
+/**
+ * struct ubifs_unclean_leb - records a LEB recovered under read-only mode.
+ * @list: list
+ * @lnum: LEB number of recovered LEB
+ * @endpt: offset where recovery ended
+ *
+ * This structure records a LEB identified during recovery that needs to be
+ * cleaned but was not because UBIFS was mounted read-only. The information
+ * is used to clean the LEB when remounting to read-write mode.
+ */
+struct ubifs_unclean_leb {
+ struct list_head list;
+ int lnum;
+ int endpt;
+};
+
+/*
+ * LEB properties flags.
+ *
+ * LPROPS_UNCAT: not categorized
+ * LPROPS_DIRTY: dirty > 0, not index
+ * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index
+ * LPROPS_FREE: free > 0, not empty, not index
+ * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
+ * LPROPS_EMPTY: LEB is empty, not taken
+ * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken
+ * LPROPS_FRDI_IDX: free + dirty == leb_size and index, may be taken
+ * LPROPS_CAT_MASK: mask for the LEB categories above
+ * LPROPS_TAKEN: LEB was taken (this flag is not saved on the media)
+ * LPROPS_INDEX: LEB contains indexing nodes (this flag also exists on flash)
+ */
+enum {
+ LPROPS_UNCAT = 0,
+ LPROPS_DIRTY = 1,
+ LPROPS_DIRTY_IDX = 2,
+ LPROPS_FREE = 3,
+ LPROPS_HEAP_CNT = 3,
+ LPROPS_EMPTY = 4,
+ LPROPS_FREEABLE = 5,
+ LPROPS_FRDI_IDX = 6,
+ LPROPS_CAT_MASK = 15,
+ LPROPS_TAKEN = 16,
+ LPROPS_INDEX = 32,
+};
+
+/**
+ * struct ubifs_lprops - logical eraseblock properties.
+ * @free: amount of free space in bytes
+ * @dirty: amount of dirty space in bytes
+ * @flags: LEB properties flags (see above)
+ * @lnum: LEB number
+ * @list: list of same-category lprops (for LPROPS_EMPTY and LPROPS_FREEABLE)
+ * @hpos: heap position in heap of same-category lprops (other categories)
+ */
+struct ubifs_lprops {
+ int free;
+ int dirty;
+ int flags;
+ int lnum;
+ union {
+ struct list_head list;
+ int hpos;
+ };
+};
+
+/**
+ * struct ubifs_lpt_lprops - LPT logical eraseblock properties.
+ * @free: amount of free space in bytes
+ * @dirty: amount of dirty space in bytes
+ * @tgc: trivial GC flag (1 => unmap after commit end)
+ * @cmt: commit flag (1 => reserved for commit)
+ */
+struct ubifs_lpt_lprops {
+ int free;
+ int dirty;
+ unsigned tgc : 1;
+ unsigned cmt : 1;
+};
+
+/**
+ * struct ubifs_lp_stats - statistics of eraseblocks in the main area.
+ * @empty_lebs: number of empty LEBs
+ * @taken_empty_lebs: number of taken LEBs
+ * @idx_lebs: number of indexing LEBs
+ * @total_free: total free space in bytes
+ * @total_dirty: total dirty space in bytes
+ * @total_used: total used space in bytes (includes only data LEBs)
+ * @total_dead: total dead space in bytes (includes only data LEBs)
+ * @total_dark: total dark space in bytes (includes only data LEBs)
+ *
+ * N.B. total_dirty and total_used are different to other total_* fields,
+ * because they account _all_ LEBs, not just data LEBs.
+ *
+ * 'taken_empty_lebs' counts the LEBs that are in the transient state of having
+ * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed
+ * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used
+ * by itself (in which case 'unused_lebs' would be a better name). In the case
+ * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC,
+ * but unlike other empty LEBs that are 'taken', it may not be written straight
+ * away (i.e. before the next commit start or unmount), so either gc_lnum must
+ * be specially accounted for, or the current approach followed i.e. count it
+ * under 'taken_empty_lebs'.
+ */
+struct ubifs_lp_stats {
+ int empty_lebs;
+ int taken_empty_lebs;
+ int idx_lebs;
+ long long total_free;
+ long long total_dirty;
+ long long total_used;
+ long long total_dead;
+ long long total_dark;
+};
+
+struct ubifs_nnode;
+
+/**
+ * struct ubifs_cnode - LEB Properties Tree common node.
+ * @parent: parent nnode
+ * @cnext: next cnode to commit
+ * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE)
+ * @iip: index in parent
+ * @level: level in the tree (zero for pnodes, greater than zero for nnodes)
+ * @num: node number
+ */
+struct ubifs_cnode {
+ struct ubifs_nnode *parent;
+ struct ubifs_cnode *cnext;
+ unsigned long flags;
+ int iip;
+ int level;
+ int num;
+};
+
+/**
+ * struct ubifs_pnode - LEB Properties Tree leaf node.
+ * @parent: parent nnode
+ * @cnext: next cnode to commit
+ * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE)
+ * @iip: index in parent
+ * @level: level in the tree (always zero for pnodes)
+ * @num: node number
+ * @lprops: LEB properties array
+ */
+struct ubifs_pnode {
+ struct ubifs_nnode *parent;
+ struct ubifs_cnode *cnext;
+ unsigned long flags;
+ int iip;
+ int level;
+ int num;
+ struct ubifs_lprops lprops[UBIFS_LPT_FANOUT];
+};
+
+/**
+ * struct ubifs_nbranch - LEB Properties Tree internal node branch.
+ * @lnum: LEB number of child
+ * @offs: offset of child
+ * @nnode: nnode child
+ * @pnode: pnode child
+ * @cnode: cnode child
+ */
+struct ubifs_nbranch {
+ int lnum;
+ int offs;
+ union {
+ struct ubifs_nnode *nnode;
+ struct ubifs_pnode *pnode;
+ struct ubifs_cnode *cnode;
+ };
+};
+
+/**
+ * struct ubifs_nnode - LEB Properties Tree internal node.
+ * @parent: parent nnode
+ * @cnext: next cnode to commit
+ * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE)
+ * @iip: index in parent
+ * @level: level in the tree (always greater than zero for nnodes)
+ * @num: node number
+ * @nbranch: branches to child nodes
+ */
+struct ubifs_nnode {
+ struct ubifs_nnode *parent;
+ struct ubifs_cnode *cnext;
+ unsigned long flags;
+ int iip;
+ int level;
+ int num;
+ struct ubifs_nbranch nbranch[UBIFS_LPT_FANOUT];
+};
+
+/**
+ * struct ubifs_lpt_heap - heap of categorized lprops.
+ * @arr: heap array
+ * @cnt: number in heap
+ * @max_cnt: maximum number allowed in heap
+ *
+ * There are %LPROPS_HEAP_CNT heaps.
+ */
+struct ubifs_lpt_heap {
+ struct ubifs_lprops **arr;
+ int cnt;
+ int max_cnt;
+};
+
+/*
+ * Return codes for LPT scan callback function.
+ *
+ * LPT_SCAN_CONTINUE: continue scanning
+ * LPT_SCAN_ADD: add the LEB properties scanned to the tree in memory
+ * LPT_SCAN_STOP: stop scanning
+ */
+enum {
+ LPT_SCAN_CONTINUE = 0,
+ LPT_SCAN_ADD = 1,
+ LPT_SCAN_STOP = 2,
+};
+
+struct ubifs_info;
+
+/* Callback used by the 'ubifs_lpt_scan_nolock()' function */
+typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
+ const struct ubifs_lprops *lprops,
+ int in_tree, void *data);
+
+/**
+ * struct ubifs_wbuf - UBIFS write-buffer.
+ * @c: UBIFS file-system description object
+ * @buf: write-buffer (of min. flash I/O unit size)
+ * @lnum: logical eraseblock number the write-buffer points to
+ * @offs: write-buffer offset in this logical eraseblock
+ * @avail: number of bytes available in the write-buffer
+ * @used: number of used bytes in the write-buffer
+ * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM,
+ * %UBI_UNKNOWN)
+ * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep
+ * up by 'mutex_lock_nested()).
+ * @sync_callback: write-buffer synchronization callback
+ * @io_mutex: serializes write-buffer I/O
+ * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes
+ * fields
+ * @timer: write-buffer timer
+ * @timeout: timer expire interval in jiffies
+ * @need_sync: it is set if its timer expired and needs sync
+ * @next_ino: points to the next position of the following inode number
+ * @inodes: stores the inode numbers of the nodes which are in wbuf
+ *
+ * The write-buffer synchronization callback is called when the write-buffer is
+ * synchronized in order to notify how much space was wasted due to
+ * write-buffer padding and how much free space is left in the LEB.
+ *
+ * Note: the fields @buf, @lnum, @offs, @avail and @used can be read under
+ * spin-lock or mutex because they are written under both mutex and spin-lock.
+ * @buf is appended to under mutex but overwritten under both mutex and
+ * spin-lock. Thus the data between @buf and @buf + @used can be read under
+ * spinlock.
+ */
+struct ubifs_wbuf {
+ struct ubifs_info *c;
+ void *buf;
+ int lnum;
+ int offs;
+ int avail;
+ int used;
+ int dtype;
+ int jhead;
+ int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
+ struct mutex io_mutex;
+ spinlock_t lock;
+ struct timer_list timer;
+ int timeout;
+ int need_sync;
+ int next_ino;
+ ino_t *inodes;
+};
+
+/**
+ * struct ubifs_bud - bud logical eraseblock.
+ * @lnum: logical eraseblock number
+ * @start: where the (uncommitted) bud data starts
+ * @jhead: journal head number this bud belongs to
+ * @list: link in the list buds belonging to the same journal head
+ * @rb: link in the tree of all buds
+ */
+struct ubifs_bud {
+ int lnum;
+ int start;
+ int jhead;
+ struct list_head list;
+ struct rb_node rb;
+};
+
+/**
+ * struct ubifs_jhead - journal head.
+ * @wbuf: head's write-buffer
+ * @buds_list: list of bud LEBs belonging to this journal head
+ *
+ * Note, the @buds list is protected by the @c->buds_lock.
+ */
+struct ubifs_jhead {
+ struct ubifs_wbuf wbuf;
+ struct list_head buds_list;
+};
+
+/**
+ * struct ubifs_zbranch - key/coordinate/length branch stored in znodes.
+ * @key: key
+ * @znode: znode address in memory
+ * @lnum: LEB number of the indexing node
+ * @offs: offset of the indexing node within @lnum
+ * @len: target node length
+ */
+struct ubifs_zbranch {
+ union ubifs_key key;
+ union {
+ struct ubifs_znode *znode;
+ void *leaf;
+ };
+ int lnum;
+ int offs;
+ int len;
+};
+
+/**
+ * struct ubifs_znode - in-memory representation of an indexing node.
+ * @parent: parent znode or NULL if it is the root
+ * @cnext: next znode to commit
+ * @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE)
+ * @time: last access time (seconds)
+ * @level: level of the entry in the TNC tree
+ * @child_cnt: count of child znodes
+ * @iip: index in parent's zbranch array
+ * @alt: lower bound of key range has altered i.e. child inserted at slot 0
+ * @lnum: LEB number of the corresponding indexing node
+ * @offs: offset of the corresponding indexing node
+ * @len: length of the corresponding indexing node
+ * @zbranch: array of znode branches (@c->fanout elements)
+ */
+struct ubifs_znode {
+ struct ubifs_znode *parent;
+ struct ubifs_znode *cnext;
+ unsigned long flags;
+ unsigned long time;
+ int level;
+ int child_cnt;
+ int iip;
+ int alt;
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ int lnum, offs, len;
+#endif
+ struct ubifs_zbranch zbranch[];
+};
+
+/**
+ * struct ubifs_node_range - node length range description data structure.
+ * @len: fixed node length
+ * @min_len: minimum possible node length
+ * @max_len: maximum possible node length
+ *
+ * If @max_len is %0, the node has fixed length @len.
+ */
+struct ubifs_node_range {
+ union {
+ int len;
+ int min_len;
+ };
+ int max_len;
+};
+
+/**
+ * struct ubifs_compressor - UBIFS compressor description structure.
+ * @compr_type: compressor type (%UBIFS_COMPR_LZO, etc)
+ * @cc: cryptoapi compressor handle
+ * @comp_mutex: mutex used during compression
+ * @decomp_mutex: mutex used during decompression
+ * @name: compressor name
+ * @capi_name: cryptoapi compressor name
+ */
+struct ubifs_compressor {
+ int compr_type;
+ struct crypto_comp *cc;
+ struct mutex *comp_mutex;
+ struct mutex *decomp_mutex;
+ const char *name;
+ const char *capi_name;
+};
+
+/**
+ * struct ubifs_budget_req - budget requirements of an operation.
+ *
+ * @fast: non-zero if the budgeting should try to acquire budget quickly and
+ * should not try to call write-back
+ * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields
+ * have to be re-calculated
+ * @new_page: non-zero if the operation adds a new page
+ * @dirtied_page: non-zero if the operation makes a page dirty
+ * @new_dent: non-zero if the operation adds a new directory entry
+ * @mod_dent: non-zero if the operation removes or modifies an existing
+ * directory entry
+ * @new_ino: non-zero if the operation adds a new inode
+ * @new_ino_d: now much data newly created inode contains
+ * @dirtied_ino: how many inodes the operation makes dirty
+ * @dirtied_ino_d: now much data dirtied inode contains
+ * @idx_growth: how much the index will supposedly grow
+ * @data_growth: how much new data the operation will supposedly add
+ * @dd_growth: how much data that makes other data dirty the operation will
+ * supposedly add
+ *
+ * @idx_growth, @data_growth and @dd_growth are not used in budget request. The
+ * budgeting subsystem caches index and data growth values there to avoid
+ * re-calculating them when the budget is released. However, if @idx_growth is
+ * %-1, it is calculated by the release function using other fields.
+ *
+ * An inode may contain 4KiB of data at max., thus the widths of @new_ino_d
+ * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made
+ * dirty by the re-name operation.
+ *
+ * Note, UBIFS aligns node lengths to 8-bytes boundary, so the requester has to
+ * make sure the amount of inode data which contribute to @new_ino_d and
+ * @dirtied_ino_d fields are aligned.
+ */
+struct ubifs_budget_req {
+ unsigned int fast:1;
+ unsigned int recalculate:1;
+#ifndef UBIFS_DEBUG
+ unsigned int new_page:1;
+ unsigned int dirtied_page:1;
+ unsigned int new_dent:1;
+ unsigned int mod_dent:1;
+ unsigned int new_ino:1;
+ unsigned int new_ino_d:13;
+ unsigned int dirtied_ino:4;
+ unsigned int dirtied_ino_d:15;
+#else
+ /* Not bit-fields to check for overflows */
+ unsigned int new_page;
+ unsigned int dirtied_page;
+ unsigned int new_dent;
+ unsigned int mod_dent;
+ unsigned int new_ino;
+ unsigned int new_ino_d;
+ unsigned int dirtied_ino;
+ unsigned int dirtied_ino_d;
+#endif
+ int idx_growth;
+ int data_growth;
+ int dd_growth;
+};
+
+/**
+ * struct ubifs_orphan - stores the inode number of an orphan.
+ * @rb: rb-tree node of rb-tree of orphans sorted by inode number
+ * @list: list head of list of orphans in order added
+ * @new_list: list head of list of orphans added since the last commit
+ * @cnext: next orphan to commit
+ * @dnext: next orphan to delete
+ * @inum: inode number
+ * @new: %1 => added since the last commit, otherwise %0
+ */
+struct ubifs_orphan {
+ struct rb_node rb;
+ struct list_head list;
+ struct list_head new_list;
+ struct ubifs_orphan *cnext;
+ struct ubifs_orphan *dnext;
+ ino_t inum;
+ int new;
+};
+
+/**
+ * struct ubifs_mount_opts - UBIFS-specific mount options information.
+ * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
+ */
+struct ubifs_mount_opts {
+ unsigned int unmount_mode:2;
+};
+
+/**
+ * struct ubifs_info - UBIFS file-system description data structure
+ * (per-superblock).
+ * @vfs_sb: VFS @struct super_block object
+ * @bdi: backing device info object to make VFS happy and disable read-ahead
+ *
+ * @highest_inum: highest used inode number
+ * @max_sqnum: current global sequence number
+ * @cmt_no: commit number of the last successfully completed commit, protected
+ * by @commit_sem
+ * @cnt_lock: protects @highest_inum and @max_sqnum counters
+ * @fmt_version: UBIFS on-flash format version
+ * @uuid: UUID from super block
+ *
+ * @lhead_lnum: log head logical eraseblock number
+ * @lhead_offs: log head offset
+ * @ltail_lnum: log tail logical eraseblock number (offset is always 0)
+ * @log_mutex: protects the log, @lhead_lnum, @lhead_offs, @ltail_lnum, and
+ * @bud_bytes
+ * @min_log_bytes: minimum required number of bytes in the log
+ * @cmt_bud_bytes: used during commit to temporarily amount of bytes in
+ * committed buds
+ *
+ * @buds: tree of all buds indexed by bud LEB number
+ * @bud_bytes: how many bytes of flash is used by buds
+ * @buds_lock: protects the @buds tree, @bud_bytes, and per-journal head bud
+ * lists
+ * @jhead_cnt: count of journal heads
+ * @jheads: journal heads (head zero is base head)
+ * @max_bud_bytes: maximum number of bytes allowed in buds
+ * @bg_bud_bytes: number of bud bytes when background commit is initiated
+ * @old_buds: buds to be released after commit ends
+ * @max_bud_cnt: maximum number of buds
+ *
+ * @commit_sem: synchronizes committer with other processes
+ * @cmt_state: commit state
+ * @cs_lock: commit state lock
+ * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
+ * @fast_unmount: do not run journal commit before un-mounting
+ * @big_lpt: flag that LPT is too big to write whole during commit
+ * @check_lpt_free: flag that indicates LPT GC may be needed
+ * @nospace: non-zero if the file-system does not have flash space (used as
+ * optimization)
+ * @nospace_rp: the same as @nospace, but additionally means that even reserved
+ * pool is full
+ *
+ * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
+ * @calc_idx_sz
+ * @zroot: zbranch which points to the root index node and znode
+ * @cnext: next znode to commit
+ * @enext: next znode to commit to empty space
+ * @gap_lebs: array of LEBs used by the in-gaps commit method
+ * @cbuf: commit buffer
+ * @ileb_buf: buffer for commit in-the-gaps method
+ * @ileb_len: length of data in ileb_buf
+ * @ihead_lnum: LEB number of index head
+ * @ihead_offs: offset of index head
+ * @ilebs: pre-allocated index LEBs
+ * @ileb_cnt: number of pre-allocated index LEBs
+ * @ileb_nxt: next pre-allocated index LEBs
+ * @old_idx: tree of index nodes obsoleted since the last commit start
+ * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c
+ * @new_ihead_lnum: used by debugging to check ihead_lnum
+ * @new_ihead_offs: used by debugging to check ihead_offs
+ *
+ * @mst_node: master node
+ * @mst_offs: offset of valid master node
+ * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
+ *
+ * @log_lebs: number of logical eraseblocks in the log
+ * @log_bytes: log size in bytes
+ * @log_last: last LEB of the log
+ * @lpt_lebs: number of LEBs used for lprops table
+ * @lpt_first: first LEB of the lprops table area
+ * @lpt_last: last LEB of the lprops table area
+ * @orph_lebs: number of LEBs used for the orphan area
+ * @orph_first: first LEB of the orphan area
+ * @orph_last: last LEB of the orphan area
+ * @main_lebs: count of LEBs in the main area
+ * @main_first: first LEB of the main area
+ * @main_bytes: main area size in bytes
+ * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
+ *
+ * @key_hash_type: type of the key hash
+ * @key_hash: direntry key hash function
+ * @key_fmt: key format
+ * @key_len: key length
+ * @fanout: fanout of the index tree (number of links per indexing node)
+ *
+ * @min_io_size: minimal input/output unit size
+ * @min_io_shift: number of bits in @min_io_size minus one
+ * @leb_size: logical eraseblock size in bytes
+ * @half_leb_size: half LEB size
+ * @leb_cnt: count of logical eraseblocks
+ * @max_leb_cnt: maximum count of logical eraseblocks
+ * @old_leb_cnt: count of logical eraseblocks before re-size
+ * @ro_media: the underlying UBI volume is read-only
+ *
+ * @dirty_pg_cnt: number of dirty pages (not used)
+ * @dirty_zn_cnt: number of dirty znodes
+ * @clean_zn_cnt: number of clean znodes
+ *
+ * @budg_idx_growth: amount of bytes budgeted for index growth
+ * @budg_data_growth: amount of bytes budgeted for cached data
+ * @budg_dd_growth: amount of bytes budgeted for cached data that will make
+ * other data dirty
+ * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index,
+ * but which still have to be taken into account because
+ * the index has not been committed so far
+ * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth,
+ * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, and @lst;
+ * @min_idx_lebs: minimum number of LEBs required for the index
+ * @old_idx_sz: size of index on flash
+ * @calc_idx_sz: temporary variable which is used to calculate new index size
+ * (contains accurate new index size at end of TNC commit start)
+ * @lst: lprops statistics
+ *
+ * @page_budget: budget for a page
+ * @inode_budget: budget for an inode
+ * @dent_budget: budget for a directory entry
+ *
+ * @ref_node_alsz: size of the LEB reference node aligned to the min. flash
+ * I/O unit
+ * @mst_node_alsz: master node aligned size
+ * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary
+ * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary
+ * @max_inode_sz: maximum possible inode size in bytes
+ * @max_znode_sz: size of znode in bytes
+ *
+ * @leb_overhead: how many bytes are wasted in an LEB when it is filled with
+ * data nodes of maximum size - used in free space reporting
+ * @dead_wm: LEB dead space watermark
+ * @dark_wm: LEB dark space watermark
+ * @block_cnt: count of 4KiB blocks on the FS
+ *
+ * @ranges: UBIFS node length ranges
+ * @ubi: UBI volume descriptor
+ * @di: UBI device information
+ * @vi: UBI volume information
+ *
+ * @orph_tree: rb-tree of orphan inode numbers
+ * @orph_list: list of orphan inode numbers in order added
+ * @orph_new: list of orphan inode numbers added since last commit
+ * @orph_cnext: next orphan to commit
+ * @orph_dnext: next orphan to delete
+ * @orphan_lock: lock for orph_tree and orph_new
+ * @orph_buf: buffer for orphan nodes
+ * @new_orphans: number of orphans since last commit
+ * @cmt_orphans: number of orphans being committed
+ * @tot_orphans: number of orphans in the rb_tree
+ * @max_orphans: maximum number of orphans allowed
+ * @ohead_lnum: orphan head LEB number
+ * @ohead_offs: orphan head offset
+ * @no_orphs: non-zero if there are no orphans
+ *
+ * @bgt: UBIFS background thread
+ * @bgt_name: background thread name
+ * @need_bgt: if background thread should run
+ * @need_wbuf_sync: if write-buffers have to be synchronized
+ *
+ * @gc_lnum: LEB number used for garbage collection
+ * @sbuf: a buffer of LEB size used by GC and replay for scanning
+ * @idx_gc: list of index LEBs that have been garbage collected
+ * @idx_gc_cnt: number of elements on the idx_gc list
+ * @gc_seq: incremented for every non-index LEB garbage collected
+ * @gced_lnum: last non-index LEB that was garbage collected
+ *
+ * @infos_list: links all 'ubifs_info' objects
+ * @umount_mutex: serializes shrinker and un-mount
+ * @shrinker_run_no: shrinker run number
+ *
+ * @space_bits: number of bits needed to record free or dirty space
+ * @lpt_lnum_bits: number of bits needed to record a LEB number in the LPT
+ * @lpt_offs_bits: number of bits needed to record an offset in the LPT
+ * @lpt_spc_bits: number of bits needed to space in the LPT
+ * @pcnt_bits: number of bits needed to record pnode or nnode number
+ * @lnum_bits: number of bits needed to record LEB number
+ * @nnode_sz: size of on-flash nnode
+ * @pnode_sz: size of on-flash pnode
+ * @ltab_sz: size of on-flash LPT lprops table
+ * @lsave_sz: size of on-flash LPT save table
+ * @pnode_cnt: number of pnodes
+ * @nnode_cnt: number of nnodes
+ * @lpt_hght: height of the LPT
+ * @pnodes_have: number of pnodes in memory
+ *
+ * @lp_mutex: protects lprops table and all the other lprops-related fields
+ * @lpt_lnum: LEB number of the root nnode of the LPT
+ * @lpt_offs: offset of the root nnode of the LPT
+ * @nhead_lnum: LEB number of LPT head
+ * @nhead_offs: offset of LPT head
+ * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. ltab
+ * @dirty_nn_cnt: number of dirty nnodes
+ * @dirty_pn_cnt: number of dirty pnodes
+ * @lpt_sz: LPT size
+ * @lpt_nod_buf: buffer for an on-flash nnode or pnode
+ * @lpt_buf: buffer of LEB size used by LPT
+ * @nroot: address in memory of the root nnode of the LPT
+ * @lpt_cnext: next LPT node to commit
+ * @lpt_heap: array of heaps of categorized lprops
+ * @dirty_idx: a (reverse sorted) copy of the LPROPS_DIRTY_IDX heap as at
+ * previous commit start
+ * @uncat_list: list of un-categorized LEBs
+ * @empty_list: list of empty LEBs
+ * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size)
+ * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size)
+ * @freeable_cnt: number of freeable LEBs in @freeable_list
+ *
+ * @ltab_lnum: LEB number of LPT's own lprops table
+ * @ltab_offs: offset of LPT's own lprops table
+ * @ltab: LPT's own lprops table
+ * @ltab_cmt: LPT's own lprops table (commit copy)
+ * @lsave_cnt: number of LEB numbers in LPT's save table
+ * @lsave_lnum: LEB number of LPT's save table
+ * @lsave_offs: offset of LPT's save table
+ * @lsave: LPT's save table
+ * @lscan_lnum: LEB number of last LPT scan
+ *
+ * @rp_size: size of the reserved pool in bytes
+ * @report_rp_size: size of the reserved pool reported to user-space
+ * @rp_uid: reserved pool user ID
+ * @rp_gid: reserved pool group ID
+ *
+ * @empty: if the UBI device is empty
+ * @replay_tree: temporary tree used during journal replay
+ * @replay_list: temporary list used during journal replay
+ * @replay_buds: list of buds to replay
+ * @cs_sqnum: sequence number of first node in the log (commit start node)
+ * @replay_sqnum: sequence number of node currently being replayed
+ * @need_recovery: file-system needs recovery
+ * @replaying: set to %1 during journal replay
+ * @unclean_leb_list: LEBs to recover when mounting ro to rw
+ * @rcvrd_mst_node: recovered master node to write when mounting ro to rw
+ * @size_tree: inode size information for recovery
+ * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY)
+ * @mount_opts: UBIFS-specific mount options
+ *
+ * @dbg_buf: a buffer of LEB size used for debugging purposes
+ * @old_zroot: old index root - used by 'dbg_check_old_index()'
+ * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
+ * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
+ * @failure_mode: failure mode for recovery testing
+ * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
+ * @fail_timeout: time in jiffies when delay of failure mode expires
+ * @fail_cnt: current number of calls to failure mode I/O functions
+ * @fail_cnt_max: number of calls by which to delay failure mode
+ */
+struct ubifs_info {
+ struct super_block *vfs_sb;
+ struct backing_dev_info bdi;
+
+ ino_t highest_inum;
+ unsigned long long max_sqnum;
+ unsigned long long cmt_no;
+ spinlock_t cnt_lock;
+ int fmt_version;
+ unsigned char uuid[16];
+
+ int lhead_lnum;
+ int lhead_offs;
+ int ltail_lnum;
+ struct mutex log_mutex;
+ int min_log_bytes;
+ long long cmt_bud_bytes;
+
+ struct rb_root buds;
+ long long bud_bytes;
+ spinlock_t buds_lock;
+ int jhead_cnt;
+ struct ubifs_jhead *jheads;
+ long long max_bud_bytes;
+ long long bg_bud_bytes;
+ struct list_head old_buds;
+ int max_bud_cnt;
+
+ struct rw_semaphore commit_sem;
+ int cmt_state;
+ spinlock_t cs_lock;
+ wait_queue_head_t cmt_wq;
+ unsigned int fast_unmount:1;
+ unsigned int big_lpt:1;
+ unsigned int check_lpt_free:1;
+ unsigned int nospace:1;
+ unsigned int nospace_rp:1;
+
+ struct mutex tnc_mutex;
+ struct ubifs_zbranch zroot;
+ struct ubifs_znode *cnext;
+ struct ubifs_znode *enext;
+ int *gap_lebs;
+ void *cbuf;
+ void *ileb_buf;
+ int ileb_len;
+ int ihead_lnum;
+ int ihead_offs;
+ int *ilebs;
+ int ileb_cnt;
+ int ileb_nxt;
+ struct rb_root old_idx;
+ int *bottom_up_buf;
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ int new_ihead_lnum;
+ int new_ihead_offs;
+#endif
+
+ struct ubifs_mst_node *mst_node;
+ int mst_offs;
+ struct mutex mst_mutex;
+
+ int log_lebs;
+ long long log_bytes;
+ int log_last;
+ int lpt_lebs;
+ int lpt_first;
+ int lpt_last;
+ int orph_lebs;
+ int orph_first;
+ int orph_last;
+ int main_lebs;
+ int main_first;
+ long long main_bytes;
+ int default_compr;
+
+ uint8_t key_hash_type;
+ uint32_t (*key_hash)(const char *str, int len);
+ int key_fmt;
+ int key_len;
+ int fanout;
+
+ int min_io_size;
+ int min_io_shift;
+ int leb_size;
+ int half_leb_size;
+ int leb_cnt;
+ int max_leb_cnt;
+ int old_leb_cnt;
+ int ro_media;
+
+ atomic_long_t dirty_pg_cnt;
+ atomic_long_t dirty_zn_cnt;
+ atomic_long_t clean_zn_cnt;
+
+ long long budg_idx_growth;
+ long long budg_data_growth;
+ long long budg_dd_growth;
+ long long budg_uncommitted_idx;
+ spinlock_t space_lock;
+ int min_idx_lebs;
+ unsigned long long old_idx_sz;
+ unsigned long long calc_idx_sz;
+ struct ubifs_lp_stats lst;
+
+ int page_budget;
+ int inode_budget;
+ int dent_budget;
+
+ int ref_node_alsz;
+ int mst_node_alsz;
+ int min_idx_node_sz;
+ int max_idx_node_sz;
+ long long max_inode_sz;
+ int max_znode_sz;
+
+ int leb_overhead;
+ int dead_wm;
+ int dark_wm;
+ int block_cnt;
+
+ struct ubifs_node_range ranges[UBIFS_NODE_TYPES_CNT];
+ struct ubi_volume_desc *ubi;
+ struct ubi_device_info di;
+ struct ubi_volume_info vi;
+
+ struct rb_root orph_tree;
+ struct list_head orph_list;
+ struct list_head orph_new;
+ struct ubifs_orphan *orph_cnext;
+ struct ubifs_orphan *orph_dnext;
+ spinlock_t orphan_lock;
+ void *orph_buf;
+ int new_orphans;
+ int cmt_orphans;
+ int tot_orphans;
+ int max_orphans;
+ int ohead_lnum;
+ int ohead_offs;
+ int no_orphs;
+
+ struct task_struct *bgt;
+ char bgt_name[sizeof(BGT_NAME_PATTERN) + 9];
+ int need_bgt;
+ int need_wbuf_sync;
+
+ int gc_lnum;
+ void *sbuf;
+ struct list_head idx_gc;
+ int idx_gc_cnt;
+ volatile int gc_seq;
+ volatile int gced_lnum;
+
+ struct list_head infos_list;
+ struct mutex umount_mutex;
+ unsigned int shrinker_run_no;
+
+ int space_bits;
+ int lpt_lnum_bits;
+ int lpt_offs_bits;
+ int lpt_spc_bits;
+ int pcnt_bits;
+ int lnum_bits;
+ int nnode_sz;
+ int pnode_sz;
+ int ltab_sz;
+ int lsave_sz;
+ int pnode_cnt;
+ int nnode_cnt;
+ int lpt_hght;
+ int pnodes_have;
+
+ struct mutex lp_mutex;
+ int lpt_lnum;
+ int lpt_offs;
+ int nhead_lnum;
+ int nhead_offs;
+ int lpt_drty_flgs;
+ int dirty_nn_cnt;
+ int dirty_pn_cnt;
+ long long lpt_sz;
+ void *lpt_nod_buf;
+ void *lpt_buf;
+ struct ubifs_nnode *nroot;
+ struct ubifs_cnode *lpt_cnext;
+ struct ubifs_lpt_heap lpt_heap[LPROPS_HEAP_CNT];
+ struct ubifs_lpt_heap dirty_idx;
+ struct list_head uncat_list;
+ struct list_head empty_list;
+ struct list_head freeable_list;
+ struct list_head frdi_idx_list;
+ int freeable_cnt;
+
+ int ltab_lnum;
+ int ltab_offs;
+ struct ubifs_lpt_lprops *ltab;
+ struct ubifs_lpt_lprops *ltab_cmt;
+ int lsave_cnt;
+ int lsave_lnum;
+ int lsave_offs;
+ int *lsave;
+ int lscan_lnum;
+
+ long long rp_size;
+ long long report_rp_size;
+ uid_t rp_uid;
+ gid_t rp_gid;
+
+ /* The below fields are used only during mounting and re-mounting */
+ int empty;
+ struct rb_root replay_tree;
+ struct list_head replay_list;
+ struct list_head replay_buds;
+ unsigned long long cs_sqnum;
+ unsigned long long replay_sqnum;
+ int need_recovery;
+ int replaying;
+ struct list_head unclean_leb_list;
+ struct ubifs_mst_node *rcvrd_mst_node;
+ struct rb_root size_tree;
+ int remounting_rw;
+ struct ubifs_mount_opts mount_opts;
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+ void *dbg_buf;
+ struct ubifs_zbranch old_zroot;
+ int old_zroot_level;
+ unsigned long long old_zroot_sqnum;
+ int failure_mode;
+ int fail_delay;
+ unsigned long fail_timeout;
+ unsigned int fail_cnt;
+ unsigned int fail_cnt_max;
+#endif
+};
+
+extern struct list_head ubifs_infos;
+extern spinlock_t ubifs_infos_lock;
+extern atomic_long_t ubifs_clean_zn_cnt;
+extern struct kmem_cache *ubifs_inode_slab;
+extern struct super_operations ubifs_super_operations;
+extern struct address_space_operations ubifs_file_address_operations;
+extern struct file_operations ubifs_file_operations;
+extern struct inode_operations ubifs_file_inode_operations;
+extern struct file_operations ubifs_dir_operations;
+extern struct inode_operations ubifs_dir_inode_operations;
+extern struct inode_operations ubifs_symlink_inode_operations;
+extern struct backing_dev_info ubifs_backing_dev_info;
+extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
+
+/* io.c */
+void ubifs_ro_mode(struct ubifs_info *c, int err);
+int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len);
+int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
+ int dtype);
+int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf);
+int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
+ int lnum, int offs);
+int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
+ int lnum, int offs);
+int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
+ int offs, int dtype);
+int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
+ int offs, int quiet);
+void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
+void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
+int ubifs_io_init(struct ubifs_info *c);
+void ubifs_pad(const struct ubifs_info *c, void *buf, int pad);
+int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf);
+int ubifs_bg_wbufs_sync(struct ubifs_info *c);
+void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum);
+int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode);
+
+/* scan.c */
+struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
+ int offs, void *sbuf);
+void ubifs_scan_destroy(struct ubifs_scan_leb *sleb);
+int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
+ int offs, int quiet);
+struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
+ int offs, void *sbuf);
+void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+ int lnum, int offs);
+int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+ void *buf, int offs);
+void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
+ void *buf);
+
+/* log.c */
+void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud);
+void ubifs_create_buds_lists(struct ubifs_info *c);
+int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs);
+struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum);
+struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum);
+int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum);
+int ubifs_log_end_commit(struct ubifs_info *c, int new_ltail_lnum);
+int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum);
+int ubifs_consolidate_log(struct ubifs_info *c);
+
+/* journal.c */
+int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
+ const struct qstr *nm, const struct inode *inode,
+ int deletion, int xent);
+int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
+ const union ubifs_key *key, const void *buf, int len);
+int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode);
+int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode);
+int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
+ const struct dentry *old_dentry,
+ const struct inode *new_dir,
+ const struct dentry *new_dentry, int sync);
+int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
+ loff_t old_size, loff_t new_size);
+int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
+ const struct inode *inode, const struct qstr *nm);
+int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1,
+ const struct inode *inode2);
+
+/* budget.c */
+int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req);
+void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req);
+void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
+ struct ubifs_inode *ui);
+int ubifs_budget_inode_op(struct ubifs_info *c, struct inode *inode,
+ struct ubifs_budget_req *req);
+void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
+ struct ubifs_budget_req *req);
+void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
+ struct ubifs_budget_req *req);
+long long ubifs_get_free_space(struct ubifs_info *c);
+int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
+void ubifs_convert_page_budget(struct ubifs_info *c);
+long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free);
+long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
+
+/* find.c */
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
+ int squeeze);
+int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
+int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
+ int min_space, int pick_free);
+int ubifs_find_dirty_idx_leb(struct ubifs_info *c);
+int ubifs_save_dirty_idx_lnums(struct ubifs_info *c);
+
+/* tnc.c */
+int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
+ struct ubifs_znode **zn, int *n);
+int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
+ void *node, const struct qstr *nm);
+int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key,
+ void *node, int *lnum, int *offs);
+int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
+ int offs, int len);
+int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
+ int old_lnum, int old_offs, int lnum, int offs, int len);
+int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
+ int lnum, int offs, int len, const struct qstr *nm);
+int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key);
+int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
+ const struct qstr *nm);
+int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
+ union ubifs_key *to_key);
+int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum);
+struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
+ union ubifs_key *key,
+ const struct qstr *nm);
+void ubifs_tnc_close(struct ubifs_info *c);
+int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level,
+ int lnum, int offs, int is_idx);
+int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level,
+ int lnum, int offs);
+/* Shared by tnc.c for tnc_commit.c */
+void destroy_old_idx(struct ubifs_info *c);
+int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level,
+ int lnum, int offs);
+int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode);
+
+/* tnc_misc.c */
+struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr,
+ struct ubifs_znode *znode);
+int ubifs_search_zbranch(const struct ubifs_info *c,
+ const struct ubifs_znode *znode,
+ const union ubifs_key *key, int *n);
+struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode);
+struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode);
+long ubifs_destroy_tnc_subtree(struct ubifs_znode *zr);
+struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
+ struct ubifs_zbranch *zbr,
+ struct ubifs_znode *parent, int iip);
+int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
+ void *node);
+
+/* tnc_commit.c */
+int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
+int ubifs_tnc_end_commit(struct ubifs_info *c);
+
+/* shrinker.c */
+int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask);
+
+/* commit.c */
+int ubifs_bg_thread(void *info);
+void ubifs_commit_required(struct ubifs_info *c);
+void ubifs_request_bg_commit(struct ubifs_info *c);
+int ubifs_run_commit(struct ubifs_info *c);
+void ubifs_recovery_commit(struct ubifs_info *c);
+int ubifs_gc_should_commit(struct ubifs_info *c);
+void ubifs_wait_for_commit(struct ubifs_info *c);
+
+/* master.c */
+int ubifs_read_master(struct ubifs_info *c);
+int ubifs_write_master(struct ubifs_info *c);
+
+/* sb.c */
+int ubifs_read_superblock(struct ubifs_info *c);
+struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c);
+int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup);
+
+/* replay.c */
+int ubifs_validate_entry(struct ubifs_info *c,
+ const struct ubifs_dent_node *dent);
+int ubifs_replay_journal(struct ubifs_info *c);
+
+/* gc.c */
+int ubifs_garbage_collect(struct ubifs_info *c, int anyway);
+int ubifs_gc_start_commit(struct ubifs_info *c);
+int ubifs_gc_end_commit(struct ubifs_info *c);
+void ubifs_destroy_idx_gc(struct ubifs_info *c);
+int ubifs_get_idx_gc_leb(struct ubifs_info *c);
+int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp);
+
+/* orphan.c */
+int ubifs_add_orphan(struct ubifs_info *c, ino_t inum);
+void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum);
+int ubifs_orphan_start_commit(struct ubifs_info *c);
+int ubifs_orphan_end_commit(struct ubifs_info *c);
+int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only);
+
+/* lpt.c */
+int ubifs_calc_lpt_geom(struct ubifs_info *c);
+int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
+ int *lpt_lebs, int *big_lpt);
+int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr);
+struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum);
+struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum);
+int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum,
+ ubifs_lpt_scan_callback scan_cb, void *data);
+
+/* Shared by lpt.c for lpt_commit.c */
+void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave);
+void ubifs_pack_ltab(struct ubifs_info *c, void *buf,
+ struct ubifs_lpt_lprops *ltab);
+void ubifs_pack_pnode(struct ubifs_info *c, void *buf,
+ struct ubifs_pnode *pnode);
+void ubifs_pack_nnode(struct ubifs_info *c, void *buf,
+ struct ubifs_nnode *nnode);
+struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c,
+ struct ubifs_nnode *parent, int iip);
+struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c,
+ struct ubifs_nnode *parent, int iip);
+int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip);
+void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty);
+void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode);
+uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits);
+struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght);
+
+/* lpt_commit.c */
+int ubifs_lpt_start_commit(struct ubifs_info *c);
+int ubifs_lpt_end_commit(struct ubifs_info *c);
+int ubifs_lpt_post_commit(struct ubifs_info *c);
+void ubifs_lpt_free(struct ubifs_info *c, int wr_only);
+
+/* lprops.c */
+void ubifs_get_lprops(struct ubifs_info *c);
+const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
+ const struct ubifs_lprops *lp,
+ int free, int dirty, int flags,
+ int idx_gc_cnt);
+void ubifs_release_lprops(struct ubifs_info *c);
+void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats);
+void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
+ int cat);
+void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
+ struct ubifs_lprops *new_lprops);
+void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops);
+int ubifs_categorize_lprops(const struct ubifs_info *c,
+ const struct ubifs_lprops *lprops);
+int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
+ int flags_set, int flags_clean, int idx_gc_cnt);
+int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
+ int flags_set, int flags_clean);
+int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp);
+const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c);
+const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c);
+const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c);
+const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
+
+/* file.c */
+int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
+int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
+
+/* dir.c */
+struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
+ int mode);
+int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
+
+/* xattr.c */
+int ubifs_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags);
+ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
+ size_t size);
+ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int ubifs_removexattr(struct dentry *dentry, const char *name);
+
+/* super.c */
+struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
+
+/* recovery.c */
+int ubifs_recover_master_node(struct ubifs_info *c);
+int ubifs_write_rcvrd_mst_node(struct ubifs_info *c);
+struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
+ int offs, void *sbuf, int grouped);
+struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
+ int offs, void *sbuf);
+int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf);
+int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf);
+int ubifs_rcvry_gc_commit(struct ubifs_info *c);
+int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key,
+ int deletion, loff_t new_size);
+int ubifs_recover_size(struct ubifs_info *c);
+void ubifs_destroy_size_tree(struct ubifs_info *c);
+
+/* ioctl.c */
+long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+void ubifs_set_inode_flags(struct inode *inode);
+#ifdef CONFIG_COMPAT
+long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+#endif
+
+/* compressor.c */
+int __init ubifs_compressors_init(void);
+void __exit ubifs_compressors_exit(void);
+void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
+ int *compr_type);
+int ubifs_decompress(const void *buf, int len, void *out, int *out_len,
+ int compr_type);
+
+#include "debug.h"
+#include "misc.h"
+#include "key.h"
+
+#endif /* !__UBIFS_H__ */
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
new file mode 100644
index 00000000000..649bec78b64
--- /dev/null
+++ b/fs/ubifs/xattr.c
@@ -0,0 +1,571 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ * Adrian Hunter
+ */
+
+/*
+ * This file implements UBIFS extended attributes support.
+ *
+ * Extended attributes are implemented as regular inodes with attached data,
+ * which limits extended attribute size to UBIFS block size (4KiB). Names of
+ * extended attributes are described by extended attribute entries (xentries),
+ * which are almost identical to directory entries, but have different key type.
+ *
+ * In other words, the situation with extended attributes is very similar to
+ * directories. Indeed, any inode (but of course not xattr inodes) may have a
+ * number of associated xentries, just like directory inodes have associated
+ * directory entries. Extended attribute entries store the name of the extended
+ * attribute, the host inode number, and the extended attribute inode number.
+ * Similarly, direntries store the name, the parent and the target inode
+ * numbers. Thus, most of the common UBIFS mechanisms may be re-used for
+ * extended attributes.
+ *
+ * The number of extended attributes is not limited, but there is Linux
+ * limitation on the maximum possible size of the list of all extended
+ * attributes associated with an inode (%XATTR_LIST_MAX), so UBIFS makes sure
+ * the sum of all extended attribute names of the inode does not exceed that
+ * limit.
+ *
+ * Extended attributes are synchronous, which means they are written to the
+ * flash media synchronously and there is no write-back for extended attribute
+ * inodes. The extended attribute values are not stored in compressed form on
+ * the media.
+ *
+ * Since extended attributes are represented by regular inodes, they are cached
+ * in the VFS inode cache. The xentries are cached in the LNC cache (see
+ * tnc.c).
+ *
+ * ACL support is not implemented.
+ */
+
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include "ubifs.h"
+
+/*
+ * Limit the number of extended attributes per inode so that the total size
+ * (@xattr_size) is guaranteeded to fit in an 'unsigned int'.
+ */
+#define MAX_XATTRS_PER_INODE 65535
+
+/*
+ * Extended attribute type constants.
+ *
+ * USER_XATTR: user extended attribute ("user.*")
+ * TRUSTED_XATTR: trusted extended attribute ("trusted.*)
+ * SECURITY_XATTR: security extended attribute ("security.*")
+ */
+enum {
+ USER_XATTR,
+ TRUSTED_XATTR,
+ SECURITY_XATTR,
+};
+
+static struct inode_operations none_inode_operations;
+static struct address_space_operations none_address_operations;
+static struct file_operations none_file_operations;
+
+/**
+ * create_xattr - create an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @nm: extended attribute name
+ * @value: extended attribute value
+ * @size: size of extended attribute value
+ *
+ * This is a helper function which creates an extended attribute of name @nm
+ * and value @value for inode @host. The host inode is also updated on flash
+ * because the ctime and extended attribute accounting data changes. This
+ * function returns zero in case of success and a negative error code in case
+ * of failure.
+ */
+static int create_xattr(struct ubifs_info *c, struct inode *host,
+ const struct qstr *nm, const void *value, int size)
+{
+ int err;
+ struct inode *inode;
+ struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+ .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
+ .dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
+
+ if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)
+ return -ENOSPC;
+ /*
+ * Linux limits the maximum size of the extended attribute names list
+ * to %XATTR_LIST_MAX. This means we should not allow creating more
+ * extended attributes if the name list becomes larger. This limitation
+ * is artificial for UBIFS, though.
+ */
+ if (host_ui->xattr_names + host_ui->xattr_cnt +
+ nm->len + 1 > XATTR_LIST_MAX)
+ return -ENOSPC;
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_budg;
+ }
+
+ /* Re-define all operations to be "nothing" */
+ inode->i_mapping->a_ops = &none_address_operations;
+ inode->i_op = &none_inode_operations;
+ inode->i_fop = &none_file_operations;
+
+ inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA;
+ ui = ubifs_inode(inode);
+ ui->xattr = 1;
+ ui->flags |= UBIFS_XATTR_FL;
+ ui->data = kmalloc(size, GFP_NOFS);
+ if (!ui->data) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ memcpy(ui->data, value, size);
+ inode->i_size = ui->ui_size = size;
+ ui->data_len = size;
+
+ mutex_lock(&host_ui->ui_mutex);
+ host->i_ctime = ubifs_current_time(host);
+ host_ui->xattr_cnt += 1;
+ host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
+ host_ui->xattr_size += CALC_XATTR_BYTES(size);
+ host_ui->xattr_names += nm->len;
+
+ err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
+ if (err)
+ goto out_cancel;
+ mutex_unlock(&host_ui->ui_mutex);
+
+ ubifs_release_budget(c, &req);
+ insert_inode_hash(inode);
+ iput(inode);
+ return 0;
+
+out_cancel:
+ host_ui->xattr_cnt -= 1;
+ host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
+ host_ui->xattr_size -= CALC_XATTR_BYTES(size);
+ mutex_unlock(&host_ui->ui_mutex);
+out_free:
+ make_bad_inode(inode);
+ iput(inode);
+out_budg:
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+/**
+ * change_xattr - change an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @inode: extended attribute inode
+ * @value: extended attribute value
+ * @size: size of extended attribute value
+ *
+ * This helper function changes the value of extended attribute @inode with new
+ * data from @value. Returns zero in case of success and a negative error code
+ * in case of failure.
+ */
+static int change_xattr(struct ubifs_info *c, struct inode *host,
+ struct inode *inode, const void *value, int size)
+{
+ int err;
+ struct ubifs_inode *host_ui = ubifs_inode(host);
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_budget_req req = { .dirtied_ino = 2,
+ .dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) };
+
+ ubifs_assert(ui->data_len == inode->i_size);
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ kfree(ui->data);
+ ui->data = kmalloc(size, GFP_NOFS);
+ if (!ui->data) {
+ err = -ENOMEM;
+ goto out_free;
+ }
+ memcpy(ui->data, value, size);
+ inode->i_size = ui->ui_size = size;
+ ui->data_len = size;
+
+ mutex_lock(&host_ui->ui_mutex);
+ host->i_ctime = ubifs_current_time(host);
+ host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
+ host_ui->xattr_size += CALC_XATTR_BYTES(size);
+
+ /*
+ * It is important to write the host inode after the xattr inode
+ * because if the host inode gets synchronized (via 'fsync()'), then
+ * the extended attribute inode gets synchronized, because it goes
+ * before the host inode in the write-buffer.
+ */
+ err = ubifs_jnl_change_xattr(c, inode, host);
+ if (err)
+ goto out_cancel;
+ mutex_unlock(&host_ui->ui_mutex);
+
+ ubifs_release_budget(c, &req);
+ return 0;
+
+out_cancel:
+ host_ui->xattr_size -= CALC_XATTR_BYTES(size);
+ host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
+ mutex_unlock(&host_ui->ui_mutex);
+ make_bad_inode(inode);
+out_free:
+ ubifs_release_budget(c, &req);
+ return err;
+}
+
+/**
+ * check_namespace - check extended attribute name-space.
+ * @nm: extended attribute name
+ *
+ * This function makes sure the extended attribute name belongs to one of the
+ * supported extended attribute name-spaces. Returns name-space index in case
+ * of success and a negative error code in case of failure.
+ */
+static int check_namespace(const struct qstr *nm)
+{
+ int type;
+
+ if (nm->len > UBIFS_MAX_NLEN)
+ return -ENAMETOOLONG;
+
+ if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX,
+ XATTR_TRUSTED_PREFIX_LEN)) {
+ if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0')
+ return -EINVAL;
+ type = TRUSTED_XATTR;
+ } else if (!strncmp(nm->name, XATTR_USER_PREFIX,
+ XATTR_USER_PREFIX_LEN)) {
+ if (nm->name[XATTR_USER_PREFIX_LEN] == '\0')
+ return -EINVAL;
+ type = USER_XATTR;
+ } else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN)) {
+ if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0')
+ return -EINVAL;
+ type = SECURITY_XATTR;
+ } else
+ return -EOPNOTSUPP;
+
+ return type;
+}
+
+static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
+{
+ struct inode *inode;
+
+ inode = ubifs_iget(c->vfs_sb, inum);
+ if (IS_ERR(inode)) {
+ ubifs_err("dead extended attribute entry, error %d",
+ (int)PTR_ERR(inode));
+ return inode;
+ }
+ if (ubifs_inode(inode)->xattr)
+ return inode;
+ ubifs_err("corrupt extended attribute entry");
+ iput(inode);
+ return ERR_PTR(-EINVAL);
+}
+
+int ubifs_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct inode *inode, *host = dentry->d_inode;
+ struct ubifs_info *c = host->i_sb->s_fs_info;
+ struct qstr nm = { .name = name, .len = strlen(name) };
+ struct ubifs_dent_node *xent;
+ union ubifs_key key;
+ int err, type;
+
+ dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name,
+ host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
+ ubifs_assert(mutex_is_locked(&host->i_mutex));
+
+ if (size > UBIFS_MAX_INO_DATA)
+ return -ERANGE;
+
+ type = check_namespace(&nm);
+ if (type < 0)
+ return type;
+
+ xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
+ if (!xent)
+ return -ENOMEM;
+
+ /*
+ * The extended attribute entries are stored in LNC, so multiple
+ * look-ups do not involve reading the flash.
+ */
+ xent_key_init(c, &key, host->i_ino, &nm);
+ err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
+ if (err) {
+ if (err != -ENOENT)
+ goto out_free;
+
+ if (flags & XATTR_REPLACE)
+ /* We are asked not to create the xattr */
+ err = -ENODATA;
+ else
+ err = create_xattr(c, host, &nm, value, size);
+ goto out_free;
+ }
+
+ if (flags & XATTR_CREATE) {
+ /* We are asked not to replace the xattr */
+ err = -EEXIST;
+ goto out_free;
+ }
+
+ inode = iget_xattr(c, le64_to_cpu(xent->inum));
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_free;
+ }
+
+ err = change_xattr(c, host, inode, value, size);
+ iput(inode);
+
+out_free:
+ kfree(xent);
+ return err;
+}
+
+ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
+ size_t size)
+{
+ struct inode *inode, *host = dentry->d_inode;
+ struct ubifs_info *c = host->i_sb->s_fs_info;
+ struct qstr nm = { .name = name, .len = strlen(name) };
+ struct ubifs_inode *ui;
+ struct ubifs_dent_node *xent;
+ union ubifs_key key;
+ int err;
+
+ dbg_gen("xattr '%s', ino %lu ('%.*s'), buf size %zd", name,
+ host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
+
+ err = check_namespace(&nm);
+ if (err < 0)
+ return err;
+
+ xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
+ if (!xent)
+ return -ENOMEM;
+
+ xent_key_init(c, &key, host->i_ino, &nm);
+ err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
+ if (err) {
+ if (err == -ENOENT)
+ err = -ENODATA;
+ goto out_unlock;
+ }
+
+ inode = iget_xattr(c, le64_to_cpu(xent->inum));
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_unlock;
+ }
+
+ ui = ubifs_inode(inode);
+ ubifs_assert(inode->i_size == ui->data_len);
+ ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len);
+
+ if (buf) {
+ /* If @buf is %NULL we are supposed to return the length */
+ if (ui->data_len > size) {
+ dbg_err("buffer size %zd, xattr len %d",
+ size, ui->data_len);
+ err = -ERANGE;
+ goto out_iput;
+ }
+
+ memcpy(buf, ui->data, ui->data_len);
+ }
+ err = ui->data_len;
+
+out_iput:
+ iput(inode);
+out_unlock:
+ kfree(xent);
+ return err;
+}
+
+ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ union ubifs_key key;
+ struct inode *host = dentry->d_inode;
+ struct ubifs_info *c = host->i_sb->s_fs_info;
+ struct ubifs_inode *host_ui = ubifs_inode(host);
+ struct ubifs_dent_node *xent, *pxent = NULL;
+ int err, len, written = 0;
+ struct qstr nm = { .name = NULL };
+
+ dbg_gen("ino %lu ('%.*s'), buffer size %zd", host->i_ino,
+ dentry->d_name.len, dentry->d_name.name, size);
+
+ len = host_ui->xattr_names + host_ui->xattr_cnt;
+ if (!buffer)
+ /*
+ * We should return the minimum buffer size which will fit a
+ * null-terminated list of all the extended attribute names.
+ */
+ return len;
+
+ if (len > size)
+ return -ERANGE;
+
+ lowest_xent_key(c, &key, host->i_ino);
+ while (1) {
+ int type;
+
+ xent = ubifs_tnc_next_ent(c, &key, &nm);
+ if (unlikely(IS_ERR(xent))) {
+ err = PTR_ERR(xent);
+ break;
+ }
+
+ nm.name = xent->name;
+ nm.len = le16_to_cpu(xent->nlen);
+
+ type = check_namespace(&nm);
+ if (unlikely(type < 0)) {
+ err = type;
+ break;
+ }
+
+ /* Show trusted namespace only for "power" users */
+ if (type != TRUSTED_XATTR || capable(CAP_SYS_ADMIN)) {
+ memcpy(buffer + written, nm.name, nm.len + 1);
+ written += nm.len + 1;
+ }
+
+ kfree(pxent);
+ pxent = xent;
+ key_read(c, &xent->key, &key);
+ }
+
+ kfree(pxent);
+ if (err != -ENOENT) {
+ ubifs_err("cannot find next direntry, error %d", err);
+ return err;
+ }
+
+ ubifs_assert(written <= size);
+ return written;
+}
+
+static int remove_xattr(struct ubifs_info *c, struct inode *host,
+ struct inode *inode, const struct qstr *nm)
+{
+ int err;
+ struct ubifs_inode *host_ui = ubifs_inode(host);
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1,
+ .dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
+
+ ubifs_assert(ui->data_len == inode->i_size);
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ mutex_lock(&host_ui->ui_mutex);
+ host->i_ctime = ubifs_current_time(host);
+ host_ui->xattr_cnt -= 1;
+ host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
+ host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
+ host_ui->xattr_names -= nm->len;
+
+ err = ubifs_jnl_delete_xattr(c, host, inode, nm);
+ if (err)
+ goto out_cancel;
+ mutex_unlock(&host_ui->ui_mutex);
+
+ ubifs_release_budget(c, &req);
+ return 0;
+
+out_cancel:
+ host_ui->xattr_cnt += 1;
+ host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
+ host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
+ mutex_unlock(&host_ui->ui_mutex);
+ ubifs_release_budget(c, &req);
+ make_bad_inode(inode);
+ return err;
+}
+
+int ubifs_removexattr(struct dentry *dentry, const char *name)
+{
+ struct inode *inode, *host = dentry->d_inode;
+ struct ubifs_info *c = host->i_sb->s_fs_info;
+ struct qstr nm = { .name = name, .len = strlen(name) };
+ struct ubifs_dent_node *xent;
+ union ubifs_key key;
+ int err;
+
+ dbg_gen("xattr '%s', ino %lu ('%.*s')", name,
+ host->i_ino, dentry->d_name.len, dentry->d_name.name);
+ ubifs_assert(mutex_is_locked(&host->i_mutex));
+
+ err = check_namespace(&nm);
+ if (err < 0)
+ return err;
+
+ xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
+ if (!xent)
+ return -ENOMEM;
+
+ xent_key_init(c, &key, host->i_ino, &nm);
+ err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
+ if (err) {
+ if (err == -ENOENT)
+ err = -ENODATA;
+ goto out_free;
+ }
+
+ inode = iget_xattr(c, le64_to_cpu(xent->inum));
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto out_free;
+ }
+
+ ubifs_assert(inode->i_nlink == 1);
+ inode->i_nlink = 0;
+ err = remove_xattr(c, host, inode, &nm);
+ if (err)
+ inode->i_nlink = 1;
+
+ /* If @i_nlink is 0, 'iput()' will delete the inode */
+ iput(inode);
+
+out_free:
+ kfree(xent);
+ return err;
+}
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 0ed6e146a0d..eb91f3b7032 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -211,6 +211,7 @@ const struct file_operations udf_file_operations = {
.release = udf_release_file,
.fsync = udf_fsync_file,
.splice_read = generic_file_splice_read,
+ .llseek = generic_file_llseek,
};
const struct inode_operations udf_file_inode_operations = {
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index eb9cfa23dc3..a4f2b3ce45b 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -76,11 +76,24 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
*err = -ENOSPC;
iinfo = UDF_I(inode);
- iinfo->i_unique = 0;
- iinfo->i_lenExtents = 0;
- iinfo->i_next_alloc_block = 0;
- iinfo->i_next_alloc_goal = 0;
- iinfo->i_strat4096 = 0;
+ if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
+ iinfo->i_efe = 1;
+ if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
+ sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
+ iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
+ sizeof(struct extendedFileEntry),
+ GFP_KERNEL);
+ } else {
+ iinfo->i_efe = 0;
+ iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
+ sizeof(struct fileEntry),
+ GFP_KERNEL);
+ }
+ if (!iinfo->i_ext.i_data) {
+ iput(inode);
+ *err = -ENOMEM;
+ return NULL;
+ }
block = udf_new_block(dir->i_sb, NULL,
dinfo->i_location.partitionReferenceNum,
@@ -111,6 +124,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
lvhd->uniqueID = cpu_to_le64(uniqueID);
mark_buffer_dirty(sbi->s_lvid_bh);
}
+ mutex_unlock(&sbi->s_alloc_mutex);
inode->i_mode = mode;
inode->i_uid = current->fsuid;
if (dir->i_mode & S_ISGID) {
@@ -129,25 +143,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
iinfo->i_lenEAttr = 0;
iinfo->i_lenAlloc = 0;
iinfo->i_use = 0;
- if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
- iinfo->i_efe = 1;
- if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev)
- sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE;
- iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
- sizeof(struct extendedFileEntry),
- GFP_KERNEL);
- } else {
- iinfo->i_efe = 0;
- iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize -
- sizeof(struct fileEntry),
- GFP_KERNEL);
- }
- if (!iinfo->i_ext.i_data) {
- iput(inode);
- *err = -ENOMEM;
- mutex_unlock(&sbi->s_alloc_mutex);
- return NULL;
- }
if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB))
iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
@@ -158,7 +153,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
iinfo->i_crtime = current_fs_time(inode->i_sb);
insert_inode_hash(inode);
mark_inode_dirty(inode);
- mutex_unlock(&sbi->s_alloc_mutex);
if (DQUOT_ALLOC_INODE(inode)) {
DQUOT_DROP(inode);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 7a5f69be6ac..e25e7010627 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -148,7 +148,7 @@ static void udf_destroy_inode(struct inode *inode)
kmem_cache_free(udf_inode_cachep, UDF_I(inode));
}
-static void init_once(struct kmem_cache *cachep, void *foo)
+static void init_once(void *foo)
{
struct udf_inode_info *ei = (struct udf_inode_info *)foo;
@@ -369,7 +369,7 @@ enum {
Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_novrs, "novrs"},
{Opt_nostrict, "nostrict"},
{Opt_bs, "bs=%u"},
@@ -682,38 +682,26 @@ static int udf_vrs(struct super_block *sb, int silent)
/*
* Check whether there is an anchor block in the given block
*/
-static int udf_check_anchor_block(struct super_block *sb, sector_t block,
- bool varconv)
+static int udf_check_anchor_block(struct super_block *sb, sector_t block)
{
- struct buffer_head *bh = NULL;
- tag *t;
+ struct buffer_head *bh;
uint16_t ident;
- uint32_t location;
- if (varconv) {
- if (udf_fixed_to_variable(block) >=
- sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
- return 0;
- bh = sb_bread(sb, udf_fixed_to_variable(block));
- }
- else
- bh = sb_bread(sb, block);
+ if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
+ udf_fixed_to_variable(block) >=
+ sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
+ return 0;
+ bh = udf_read_tagged(sb, block, block, &ident);
if (!bh)
return 0;
-
- t = (tag *)bh->b_data;
- ident = le16_to_cpu(t->tagIdent);
- location = le32_to_cpu(t->tagLocation);
brelse(bh);
- if (ident != TAG_IDENT_AVDP)
- return 0;
- return location == block;
+
+ return ident == TAG_IDENT_AVDP;
}
/* Search for an anchor volume descriptor pointer */
-static sector_t udf_scan_anchors(struct super_block *sb, bool varconv,
- sector_t lastblock)
+static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock)
{
sector_t last[6];
int i;
@@ -739,7 +727,7 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv,
sb->s_blocksize_bits)
continue;
- if (udf_check_anchor_block(sb, last[i], varconv)) {
+ if (udf_check_anchor_block(sb, last[i])) {
sbi->s_anchor[0] = last[i];
sbi->s_anchor[1] = last[i] - 256;
return last[i];
@@ -748,17 +736,17 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv,
if (last[i] < 256)
continue;
- if (udf_check_anchor_block(sb, last[i] - 256, varconv)) {
+ if (udf_check_anchor_block(sb, last[i] - 256)) {
sbi->s_anchor[1] = last[i] - 256;
return last[i];
}
}
- if (udf_check_anchor_block(sb, sbi->s_session + 256, varconv)) {
+ if (udf_check_anchor_block(sb, sbi->s_session + 256)) {
sbi->s_anchor[0] = sbi->s_session + 256;
return last[0];
}
- if (udf_check_anchor_block(sb, sbi->s_session + 512, varconv)) {
+ if (udf_check_anchor_block(sb, sbi->s_session + 512)) {
sbi->s_anchor[0] = sbi->s_session + 512;
return last[0];
}
@@ -780,23 +768,24 @@ static void udf_find_anchor(struct super_block *sb)
int i;
struct udf_sb_info *sbi = UDF_SB(sb);
- lastblock = udf_scan_anchors(sb, 0, sbi->s_last_block);
+ lastblock = udf_scan_anchors(sb, sbi->s_last_block);
if (lastblock)
goto check_anchor;
/* No anchor found? Try VARCONV conversion of block numbers */
+ UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
/* Firstly, we try to not convert number of the last block */
- lastblock = udf_scan_anchors(sb, 1,
+ lastblock = udf_scan_anchors(sb,
udf_variable_to_fixed(sbi->s_last_block));
- if (lastblock) {
- UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+ if (lastblock)
goto check_anchor;
- }
/* Secondly, we try with converted number of the last block */
- lastblock = udf_scan_anchors(sb, 1, sbi->s_last_block);
- if (lastblock)
- UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+ lastblock = udf_scan_anchors(sb, sbi->s_last_block);
+ if (!lastblock) {
+ /* VARCONV didn't help. Clear it. */
+ UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
+ }
check_anchor:
/*
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 85b22b5977f..e65212dfb60 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -76,6 +76,7 @@
#include <linux/errno.h>
#include <linux/fs.h>
+#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/stat.h>
@@ -308,7 +309,7 @@ enum {
Opt_err
};
-static match_table_t tokens = {
+static const match_table_t tokens = {
{Opt_type_old, "ufstype=old"},
{Opt_type_sunx86, "ufstype=sunx86"},
{Opt_type_sun, "ufstype=sun"},
@@ -1232,7 +1233,7 @@ static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs)
{
struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb);
unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
- struct match_token *tp = tokens;
+ const struct match_token *tp = tokens;
while (tp->token != Opt_onerror_panic && tp->token != mval)
++tp;
@@ -1301,7 +1302,7 @@ static void ufs_destroy_inode(struct inode *inode)
kmem_cache_free(ufs_inode_cachep, UFS_I(inode));
}
-static void init_once(struct kmem_cache * cachep, void *foo)
+static void init_once(void *foo)
{
struct ufs_inode_info *ei = (struct ufs_inode_info *) foo;
diff --git a/fs/utimes.c b/fs/utimes.c
index af059d5cb48..6929e3e91d0 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -40,75 +40,30 @@ asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times)
#endif
-static bool nsec_special(long nsec)
-{
- return nsec == UTIME_OMIT || nsec == UTIME_NOW;
-}
-
static bool nsec_valid(long nsec)
{
- if (nsec_special(nsec))
+ if (nsec == UTIME_OMIT || nsec == UTIME_NOW)
return true;
return nsec >= 0 && nsec <= 999999999;
}
-/* If times==NULL, set access and modification to current time,
- * must be owner or have write permission.
- * Else, update from *times, must be owner or super user.
- */
-long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags)
+static int utimes_common(struct path *path, struct timespec *times)
{
int error;
- struct nameidata nd;
- struct dentry *dentry;
- struct inode *inode;
struct iattr newattrs;
- struct file *f = NULL;
- struct vfsmount *mnt;
-
- error = -EINVAL;
- if (times && (!nsec_valid(times[0].tv_nsec) ||
- !nsec_valid(times[1].tv_nsec))) {
- goto out;
- }
+ struct inode *inode = path->dentry->d_inode;
- if (flags & ~AT_SYMLINK_NOFOLLOW)
+ error = mnt_want_write(path->mnt);
+ if (error)
goto out;
- if (filename == NULL && dfd != AT_FDCWD) {
- error = -EINVAL;
- if (flags & AT_SYMLINK_NOFOLLOW)
- goto out;
-
- error = -EBADF;
- f = fget(dfd);
- if (!f)
- goto out;
- dentry = f->f_path.dentry;
- mnt = f->f_path.mnt;
- } else {
- error = __user_walk_fd(dfd, filename, (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW, &nd);
- if (error)
- goto out;
-
- dentry = nd.path.dentry;
- mnt = nd.path.mnt;
- }
-
- inode = dentry->d_inode;
+ if (times && times[0].tv_nsec == UTIME_NOW &&
+ times[1].tv_nsec == UTIME_NOW)
+ times = NULL;
- error = mnt_want_write(mnt);
- if (error)
- goto dput_and_out;
-
- /* Don't worry, the checks are done in inode_change_ok() */
newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME;
if (times) {
- error = -EPERM;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- goto mnt_drop_write_and_out;
-
if (times[0].tv_nsec == UTIME_OMIT)
newattrs.ia_valid &= ~ATTR_ATIME;
else if (times[0].tv_nsec != UTIME_NOW) {
@@ -124,40 +79,93 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
newattrs.ia_mtime.tv_nsec = times[1].tv_nsec;
newattrs.ia_valid |= ATTR_MTIME_SET;
}
- }
-
- /*
- * If times is NULL or both times are either UTIME_OMIT or
- * UTIME_NOW, then need to check permissions, because
- * inode_change_ok() won't do it.
- */
- if (!times || (nsec_special(times[0].tv_nsec) &&
- nsec_special(times[1].tv_nsec))) {
+ /*
+ * Tell inode_change_ok(), that this is an explicit time
+ * update, even if neither ATTR_ATIME_SET nor ATTR_MTIME_SET
+ * were used.
+ */
+ newattrs.ia_valid |= ATTR_TIMES_SET;
+ } else {
+ /*
+ * If times is NULL (or both times are UTIME_NOW),
+ * then we need to check permissions, because
+ * inode_change_ok() won't do it.
+ */
error = -EACCES;
if (IS_IMMUTABLE(inode))
goto mnt_drop_write_and_out;
if (!is_owner_or_cap(inode)) {
- if (f) {
- if (!(f->f_mode & FMODE_WRITE))
- goto mnt_drop_write_and_out;
- } else {
- error = vfs_permission(&nd, MAY_WRITE);
- if (error)
- goto mnt_drop_write_and_out;
- }
+ error = inode_permission(inode, MAY_WRITE);
+ if (error)
+ goto mnt_drop_write_and_out;
}
}
mutex_lock(&inode->i_mutex);
- error = notify_change(dentry, &newattrs);
+ error = notify_change(path->dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
+
mnt_drop_write_and_out:
- mnt_drop_write(mnt);
-dput_and_out:
- if (f)
- fput(f);
- else
- path_put(&nd.path);
+ mnt_drop_write(path->mnt);
+out:
+ return error;
+}
+
+/*
+ * do_utimes - change times on filename or file descriptor
+ * @dfd: open file descriptor, -1 or AT_FDCWD
+ * @filename: path name or NULL
+ * @times: new times or NULL
+ * @flags: zero or more flags (only AT_SYMLINK_NOFOLLOW for the moment)
+ *
+ * If filename is NULL and dfd refers to an open file, then operate on
+ * the file. Otherwise look up filename, possibly using dfd as a
+ * starting point.
+ *
+ * If times==NULL, set access and modification to current time,
+ * must be owner or have write permission.
+ * Else, update from *times, must be owner or super user.
+ */
+long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags)
+{
+ int error = -EINVAL;
+
+ if (times && (!nsec_valid(times[0].tv_nsec) ||
+ !nsec_valid(times[1].tv_nsec))) {
+ goto out;
+ }
+
+ if (flags & ~AT_SYMLINK_NOFOLLOW)
+ goto out;
+
+ if (filename == NULL && dfd != AT_FDCWD) {
+ struct file *file;
+
+ if (flags & AT_SYMLINK_NOFOLLOW)
+ goto out;
+
+ file = fget(dfd);
+ error = -EBADF;
+ if (!file)
+ goto out;
+
+ error = utimes_common(&file->f_path, times);
+ fput(file);
+ } else {
+ struct path path;
+ int lookup_flags = 0;
+
+ if (!(flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags |= LOOKUP_FOLLOW;
+
+ error = user_path_at(dfd, filename, lookup_flags, &path);
+ if (error)
+ goto out;
+
+ error = utimes_common(&path, times);
+ path_put(&path);
+ }
+
out:
return error;
}
@@ -169,14 +177,6 @@ asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __
if (utimes) {
if (copy_from_user(&tstimes, utimes, sizeof(tstimes)))
return -EFAULT;
- if ((tstimes[0].tv_nsec == UTIME_OMIT ||
- tstimes[0].tv_nsec == UTIME_NOW) &&
- tstimes[0].tv_sec != 0)
- return -EINVAL;
- if ((tstimes[1].tv_nsec == UTIME_OMIT ||
- tstimes[1].tv_nsec == UTIME_NOW) &&
- tstimes[1].tv_sec != 0)
- return -EINVAL;
/* Nothing to do, we must not even check the path. */
if (tstimes[0].tv_nsec == UTIME_OMIT &&
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a3522727ea5..155c10b4adb 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -621,7 +621,7 @@ shortname:
memcpy(de->name, msdos_name, MSDOS_NAME);
de->attr = is_dir ? ATTR_DIR : ATTR_ARCH;
de->lcase = lcase;
- fat_date_unix2dos(ts->tv_sec, &time, &date);
+ fat_date_unix2dos(ts->tv_sec, &time, &date, sbi->options.tz_utc);
de->time = de->ctime = time;
de->date = de->cdate = de->adate = date;
de->ctime_cs = 0;
@@ -645,7 +645,7 @@ static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
if (len == 0)
return -ENOENT;
- slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_KERNEL);
+ slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
if (slots == NULL)
return -ENOMEM;
@@ -687,7 +687,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
struct dentry *alias;
int err, table;
- lock_kernel();
+ lock_super(sb);
table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
dentry->d_op = &vfat_dentry_ops[table];
@@ -699,7 +699,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
brelse(sinfo.bh);
if (IS_ERR(inode)) {
- unlock_kernel();
+ unlock_super(sb);
return ERR_CAST(inode);
}
alias = d_find_alias(inode);
@@ -708,13 +708,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
dput(alias);
else {
iput(inode);
- unlock_kernel();
+ unlock_super(sb);
return alias;
}
}
error:
- unlock_kernel();
+ unlock_super(sb);
dentry->d_op = &vfat_dentry_ops[table];
dentry->d_time = dentry->d_parent->d_inode->i_version;
dentry = d_splice_alias(inode, dentry);
@@ -734,7 +734,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
struct timespec ts;
int err;
- lock_kernel();
+ lock_super(sb);
ts = CURRENT_TIME_SEC;
err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
@@ -755,17 +755,18 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
dentry->d_time = dentry->d_parent->d_inode->i_version;
d_instantiate(dentry, inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = fat_dir_empty(inode);
if (err)
@@ -783,7 +784,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -791,10 +792,11 @@ out:
static int vfat_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = vfat_find(dir, &dentry->d_name, &sinfo);
if (err)
@@ -807,7 +809,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -820,7 +822,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct timespec ts;
int err, cluster;
- lock_kernel();
+ lock_super(sb);
ts = CURRENT_TIME_SEC;
cluster = fat_alloc_new_dir(dir, &ts);
@@ -849,13 +851,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
dentry->d_time = dentry->d_parent->d_inode->i_version;
d_instantiate(dentry, inode);
- unlock_kernel();
+ unlock_super(sb);
return 0;
out_free:
fat_free_clusters(dir, cluster);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -869,11 +871,12 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
struct timespec ts;
loff_t dotdot_i_pos, new_i_pos;
int err, is_dir, update_dotdot, corrupt = 0;
+ struct super_block *sb = old_dir->i_sb;
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
- lock_kernel();
+ lock_super(sb);
err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
if (err)
goto out;
@@ -951,7 +954,7 @@ out:
brelse(sinfo.bh);
brelse(dotdot_bh);
brelse(old_sinfo.bh);
- unlock_kernel();
+ unlock_super(sb);
return err;
diff --git a/fs/xattr.c b/fs/xattr.c
index 4706a8b1f49..468377e6653 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -63,7 +63,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
return -EPERM;
}
- return permission(inode, mask, NULL);
+ return inode_permission(inode, mask);
}
int
@@ -252,40 +252,40 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
}
asmlinkage long
-sys_setxattr(const char __user *path, const char __user *name,
+sys_setxattr(const char __user *pathname, const char __user *name,
const void __user *value, size_t size, int flags)
{
- struct nameidata nd;
+ struct path path;
int error;
- error = user_path_walk(path, &nd);
+ error = user_path(pathname, &path);
if (error)
return error;
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(path.mnt);
if (!error) {
- error = setxattr(nd.path.dentry, name, value, size, flags);
- mnt_drop_write(nd.path.mnt);
+ error = setxattr(path.dentry, name, value, size, flags);
+ mnt_drop_write(path.mnt);
}
- path_put(&nd.path);
+ path_put(&path);
return error;
}
asmlinkage long
-sys_lsetxattr(const char __user *path, const char __user *name,
+sys_lsetxattr(const char __user *pathname, const char __user *name,
const void __user *value, size_t size, int flags)
{
- struct nameidata nd;
+ struct path path;
int error;
- error = user_path_walk_link(path, &nd);
+ error = user_lpath(pathname, &path);
if (error)
return error;
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(path.mnt);
if (!error) {
- error = setxattr(nd.path.dentry, name, value, size, flags);
- mnt_drop_write(nd.path.mnt);
+ error = setxattr(path.dentry, name, value, size, flags);
+ mnt_drop_write(path.mnt);
}
- path_put(&nd.path);
+ path_put(&path);
return error;
}
@@ -350,32 +350,32 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
}
asmlinkage ssize_t
-sys_getxattr(const char __user *path, const char __user *name,
+sys_getxattr(const char __user *pathname, const char __user *name,
void __user *value, size_t size)
{
- struct nameidata nd;
+ struct path path;
ssize_t error;
- error = user_path_walk(path, &nd);
+ error = user_path(pathname, &path);
if (error)
return error;
- error = getxattr(nd.path.dentry, name, value, size);
- path_put(&nd.path);
+ error = getxattr(path.dentry, name, value, size);
+ path_put(&path);
return error;
}
asmlinkage ssize_t
-sys_lgetxattr(const char __user *path, const char __user *name, void __user *value,
+sys_lgetxattr(const char __user *pathname, const char __user *name, void __user *value,
size_t size)
{
- struct nameidata nd;
+ struct path path;
ssize_t error;
- error = user_path_walk_link(path, &nd);
+ error = user_lpath(pathname, &path);
if (error)
return error;
- error = getxattr(nd.path.dentry, name, value, size);
- path_put(&nd.path);
+ error = getxattr(path.dentry, name, value, size);
+ path_put(&path);
return error;
}
@@ -425,30 +425,30 @@ listxattr(struct dentry *d, char __user *list, size_t size)
}
asmlinkage ssize_t
-sys_listxattr(const char __user *path, char __user *list, size_t size)
+sys_listxattr(const char __user *pathname, char __user *list, size_t size)
{
- struct nameidata nd;
+ struct path path;
ssize_t error;
- error = user_path_walk(path, &nd);
+ error = user_path(pathname, &path);
if (error)
return error;
- error = listxattr(nd.path.dentry, list, size);
- path_put(&nd.path);
+ error = listxattr(path.dentry, list, size);
+ path_put(&path);
return error;
}
asmlinkage ssize_t
-sys_llistxattr(const char __user *path, char __user *list, size_t size)
+sys_llistxattr(const char __user *pathname, char __user *list, size_t size)
{
- struct nameidata nd;
+ struct path path;
ssize_t error;
- error = user_path_walk_link(path, &nd);
+ error = user_lpath(pathname, &path);
if (error)
return error;
- error = listxattr(nd.path.dentry, list, size);
- path_put(&nd.path);
+ error = listxattr(path.dentry, list, size);
+ path_put(&path);
return error;
}
@@ -486,38 +486,38 @@ removexattr(struct dentry *d, const char __user *name)
}
asmlinkage long
-sys_removexattr(const char __user *path, const char __user *name)
+sys_removexattr(const char __user *pathname, const char __user *name)
{
- struct nameidata nd;
+ struct path path;
int error;
- error = user_path_walk(path, &nd);
+ error = user_path(pathname, &path);
if (error)
return error;
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(path.mnt);
if (!error) {
- error = removexattr(nd.path.dentry, name);
- mnt_drop_write(nd.path.mnt);
+ error = removexattr(path.dentry, name);
+ mnt_drop_write(path.mnt);
}
- path_put(&nd.path);
+ path_put(&path);
return error;
}
asmlinkage long
-sys_lremovexattr(const char __user *path, const char __user *name)
+sys_lremovexattr(const char __user *pathname, const char __user *name)
{
- struct nameidata nd;
+ struct path path;
int error;
- error = user_path_walk_link(path, &nd);
+ error = user_lpath(pathname, &path);
if (error)
return error;
- error = mnt_want_write(nd.path.mnt);
+ error = mnt_want_write(path.mnt);
if (!error) {
- error = removexattr(nd.path.dentry, name);
- mnt_drop_write(nd.path.mnt);
+ error = removexattr(path.dentry, name);
+ mnt_drop_write(path.mnt);
}
- path_put(&nd.path);
+ path_put(&path);
return error;
}
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 36ec614e699..737c9a42536 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -106,7 +106,8 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
xfs_iops.o \
xfs_lrw.o \
xfs_super.o \
- xfs_vnode.o)
+ xfs_vnode.o \
+ xfs_xattr.o)
# Objects in support/
xfs-y += $(addprefix support/, \
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 9b1bb17a050..1cd3b55ee3d 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -90,7 +90,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize,
}
void
-kmem_free(void *ptr, size_t size)
+kmem_free(const void *ptr)
{
if (!is_vmalloc_addr(ptr)) {
kfree(ptr);
@@ -100,7 +100,7 @@ kmem_free(void *ptr, size_t size)
}
void *
-kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
+kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
unsigned int __nocast flags)
{
void *new;
@@ -110,7 +110,7 @@ kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
if (new)
memcpy(new, ptr,
((oldsize < newsize) ? oldsize : newsize));
- kmem_free(ptr, oldsize);
+ kmem_free(ptr);
}
return new;
}
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 5e956490297..af6843c7ee4 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -57,8 +57,8 @@ kmem_flags_convert(unsigned int __nocast flags)
extern void *kmem_alloc(size_t, unsigned int __nocast);
extern void *kmem_zalloc(size_t, unsigned int __nocast);
extern void *kmem_zalloc_greedy(size_t *, size_t, size_t, unsigned int __nocast);
-extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast);
-extern void kmem_free(void *, size_t);
+extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
+extern void kmem_free(const void *);
/*
* Zone interfaces
@@ -79,7 +79,7 @@ kmem_zone_init(int size, char *zone_name)
static inline kmem_zone_t *
kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
- void (*construct)(kmem_zone_t *, void *))
+ void (*construct)(void *))
{
return kmem_cache_create(zone_name, size, 0, flags, construct);
}
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
deleted file mode 100644
index 3abe7e9ceb3..00000000000
--- a/fs/xfs/linux-2.6/sema.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_SUPPORT_SEMA_H__
-#define __XFS_SUPPORT_SEMA_H__
-
-#include <linux/time.h>
-#include <linux/wait.h>
-#include <linux/semaphore.h>
-#include <asm/atomic.h>
-
-/*
- * sema_t structure just maps to struct semaphore in Linux kernel.
- */
-
-typedef struct semaphore sema_t;
-
-#define initnsema(sp, val, name) sema_init(sp, val)
-#define psema(sp, b) down(sp)
-#define vsema(sp) up(sp)
-#define freesema(sema) do { } while (0)
-
-static inline int issemalocked(sema_t *sp)
-{
- return down_trylock(sp) || (up(sp), 0);
-}
-
-/*
- * Map cpsema (try to get the sema) to down_trylock. We need to switch
- * the return values since cpsema returns 1 (acquired) 0 (failed) and
- * down_trylock returns the reverse 0 (acquired) 1 (failed).
- */
-static inline int cpsema(sema_t *sp)
-{
- return down_trylock(sp) ? 0 : 1;
-}
-
-#endif /* __XFS_SUPPORT_SEMA_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index a55c3b26d84..a44d68eb50b 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -73,7 +73,6 @@ xfs_page_trace(
unsigned long pgoff)
{
xfs_inode_t *ip;
- bhv_vnode_t *vp = vn_from_inode(inode);
loff_t isize = i_size_read(inode);
loff_t offset = page_offset(page);
int delalloc = -1, unmapped = -1, unwritten = -1;
@@ -81,7 +80,7 @@ xfs_page_trace(
if (page_has_buffers(page))
xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
- ip = xfs_vtoi(vp);
+ ip = XFS_I(inode);
if (!ip->i_rwtrace)
return;
@@ -409,7 +408,6 @@ xfs_start_buffer_writeback(
STATIC void
xfs_start_page_writeback(
struct page *page,
- struct writeback_control *wbc,
int clear_dirty,
int buffers)
{
@@ -676,7 +674,7 @@ xfs_probe_cluster(
} else
pg_offset = PAGE_CACHE_SIZE;
- if (page->index == tindex && !TestSetPageLocked(page)) {
+ if (page->index == tindex && trylock_page(page)) {
pg_len = xfs_probe_page(page, pg_offset, mapped);
unlock_page(page);
}
@@ -760,7 +758,7 @@ xfs_convert_page(
if (page->index != tindex)
goto fail;
- if (TestSetPageLocked(page))
+ if (!trylock_page(page))
goto fail;
if (PageWriteback(page))
goto fail_unlock_page;
@@ -858,7 +856,7 @@ xfs_convert_page(
done = 1;
}
}
- xfs_start_page_writeback(page, wbc, !page_dirty, count);
+ xfs_start_page_writeback(page, !page_dirty, count);
}
return done;
@@ -1105,7 +1103,7 @@ xfs_page_state_convert(
* that we are writing into for the first time.
*/
type = IOMAP_NEW;
- if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
+ if (trylock_buffer(bh)) {
ASSERT(buffer_mapped(bh));
if (iomap_valid)
all_bh = 1;
@@ -1130,7 +1128,7 @@ xfs_page_state_convert(
SetPageUptodate(page);
if (startio)
- xfs_start_page_writeback(page, wbc, 1, count);
+ xfs_start_page_writeback(page, 1, count);
if (ioend && iomap_valid) {
offset = (iomap.iomap_offset + iomap.iomap_bsize - 1) >>
@@ -1340,6 +1338,10 @@ __xfs_get_blocks(
offset = (xfs_off_t)iblock << inode->i_blkbits;
ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
size = bh_result->b_size;
+
+ if (!create && direct && offset >= i_size_read(inode))
+ return 0;
+
error = xfs_iomap(XFS_I(inode), offset, size,
create ? flags : BMAPI_READ, &iomap, &niomap);
if (error)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 98e0e86093b..36d5fcd3f59 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -58,7 +58,7 @@ xfs_buf_trace(
bp, id,
(void *)(unsigned long)bp->b_flags,
(void *)(unsigned long)bp->b_hold.counter,
- (void *)(unsigned long)bp->b_sema.count.counter,
+ (void *)(unsigned long)bp->b_sema.count,
(void *)current,
data, ra,
(void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
@@ -253,7 +253,7 @@ _xfs_buf_initialize(
memset(bp, 0, sizeof(xfs_buf_t));
atomic_set(&bp->b_hold, 1);
- init_MUTEX_LOCKED(&bp->b_iodonesema);
+ init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_list);
INIT_LIST_HEAD(&bp->b_hash_list);
init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
@@ -310,8 +310,7 @@ _xfs_buf_free_pages(
xfs_buf_t *bp)
{
if (bp->b_pages != bp->b_page_array) {
- kmem_free(bp->b_pages,
- bp->b_page_count * sizeof(struct page *));
+ kmem_free(bp->b_pages);
}
}
@@ -839,6 +838,7 @@ xfs_buf_rele(
return;
}
+ ASSERT(atomic_read(&bp->b_hold) > 0);
if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
if (bp->b_relse) {
atomic_inc(&bp->b_hold);
@@ -852,11 +852,6 @@ xfs_buf_rele(
spin_unlock(&hash->bh_lock);
xfs_buf_free(bp);
}
- } else {
- /*
- * Catch reference count leaks
- */
- ASSERT(atomic_read(&bp->b_hold) >= 0);
}
}
@@ -1006,12 +1001,13 @@ xfs_buf_iodone_work(
* We can get an EOPNOTSUPP to ordered writes. Here we clear the
* ordered flag and reissue them. Because we can't tell the higher
* layers directly that they should not issue ordered I/O anymore, they
- * need to check if the ordered flag was cleared during I/O completion.
+ * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
*/
if ((bp->b_error == EOPNOTSUPP) &&
(bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
XB_TRACE(bp, "ordered_retry", bp->b_iodone);
bp->b_flags &= ~XBF_ORDERED;
+ bp->b_flags |= _XFS_BARRIER_FAILED;
xfs_buf_iorequest(bp);
} else if (bp->b_iodone)
(*(bp->b_iodone))(bp);
@@ -1038,7 +1034,7 @@ xfs_buf_ioend(
xfs_buf_iodone_work(&bp->b_iodone_work);
}
} else {
- up(&bp->b_iodonesema);
+ complete(&bp->b_iowait);
}
}
@@ -1276,7 +1272,7 @@ xfs_buf_iowait(
XB_TRACE(bp, "iowait", 0);
if (atomic_read(&bp->b_io_remaining))
blk_run_address_space(bp->b_target->bt_mapping);
- down(&bp->b_iodonesema);
+ wait_for_completion(&bp->b_iowait);
XB_TRACE(bp, "iowaited", (long)bp->b_error);
return bp->b_error;
}
@@ -1398,7 +1394,7 @@ STATIC void
xfs_free_bufhash(
xfs_buftarg_t *btp)
{
- kmem_free(btp->bt_hash, (1<<btp->bt_hashshift) * sizeof(xfs_bufhash_t));
+ kmem_free(btp->bt_hash);
btp->bt_hash = NULL;
}
@@ -1428,13 +1424,10 @@ xfs_unregister_buftarg(
void
xfs_free_buftarg(
- xfs_buftarg_t *btp,
- int external)
+ xfs_buftarg_t *btp)
{
xfs_flush_buftarg(btp, 1);
xfs_blkdev_issue_flush(btp);
- if (external)
- xfs_blkdev_put(btp->bt_bdev);
xfs_free_bufhash(btp);
iput(btp->bt_mapping->host);
@@ -1444,7 +1437,7 @@ xfs_free_buftarg(
xfs_unregister_buftarg(btp);
kthread_stop(btp->bt_task);
- kmem_free(btp, sizeof(*btp));
+ kmem_free(btp);
}
STATIC int
@@ -1575,7 +1568,7 @@ xfs_alloc_buftarg(
return btp;
error:
- kmem_free(btp, sizeof(*btp));
+ kmem_free(btp);
return NULL;
}
@@ -1803,7 +1796,7 @@ int __init
xfs_buf_init(void)
{
#ifdef XFS_BUF_TRACE
- xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP);
+ xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS);
#endif
xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index f948ec7ba9a..456519a088c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -85,6 +85,14 @@ typedef enum {
* modifications being lost.
*/
_XBF_PAGE_LOCKED = (1 << 22),
+
+ /*
+ * If we try a barrier write, but it fails we have to communicate
+ * this to the upper layers. Unfortunately b_error gets overwritten
+ * when the buffer is re-issued so we have to add another flag to
+ * keep this information.
+ */
+ _XFS_BARRIER_FAILED = (1 << 23),
} xfs_buf_flags_t;
typedef enum {
@@ -157,7 +165,7 @@ typedef struct xfs_buf {
xfs_buf_iodone_t b_iodone; /* I/O completion function */
xfs_buf_relse_t b_relse; /* releasing function */
xfs_buf_bdstrat_t b_strat; /* pre-write function */
- struct semaphore b_iodonesema; /* Semaphore for I/O waiters */
+ struct completion b_iowait; /* queue for I/O waiters */
void *b_fspriv;
void *b_fspriv2;
void *b_fspriv3;
@@ -352,7 +360,7 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
#define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0)
#define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp)
#define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp)
-#define XFS_BUF_V_IODONESEMA(bp) up(&bp->b_iodonesema);
+#define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait);
#define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target))
#define XFS_BUF_TARGET(bp) ((bp)->b_target)
@@ -429,7 +437,7 @@ static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp)
* Handling of buftargs.
*/
extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
-extern void xfs_free_buftarg(xfs_buftarg_t *, int);
+extern void xfs_free_buftarg(xfs_buftarg_t *);
extern void xfs_wait_buftarg(xfs_buftarg_t *);
extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index c672b3238b1..24fd598af84 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -139,7 +139,7 @@ xfs_nfs_get_inode(
}
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return ip->i_vnode;
+ return VFS_I(ip);
}
STATIC struct dentry *
@@ -167,7 +167,7 @@ xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
if (!inode)
return NULL;
if (IS_ERR(inode))
- return ERR_PTR(PTR_ERR(inode));
+ return ERR_CAST(inode);
result = d_alloc_anon(inode);
if (!result) {
iput(inode);
@@ -198,7 +198,7 @@ xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
if (!inode)
return NULL;
if (IS_ERR(inode))
- return ERR_PTR(PTR_ERR(inode));
+ return ERR_CAST(inode);
result = d_alloc_anon(inode);
if (!result) {
iput(inode);
@@ -215,13 +215,13 @@ xfs_fs_get_parent(
struct xfs_inode *cip;
struct dentry *parent;
- error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip);
+ error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL);
if (unlikely(error))
return ERR_PTR(-error);
- parent = d_alloc_anon(cip->i_vnode);
+ parent = d_alloc_anon(VFS_I(cip));
if (unlikely(!parent)) {
- iput(cip->i_vnode);
+ iput(VFS_I(cip));
return ERR_PTR(-ENOMEM);
}
return parent;
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 5f60363b934..5311c1acdd4 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -475,6 +475,7 @@ const struct file_operations xfs_invis_file_operations = {
const struct file_operations xfs_dir_file_operations = {
.read = generic_read_dir,
.readdir = xfs_file_readdir,
+ .llseek = generic_file_llseek,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 1eefe61f0e1..36caa6d957d 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -31,7 +31,7 @@ xfs_tosspages(
xfs_off_t last,
int fiopt)
{
- struct address_space *mapping = ip->i_vnode->i_mapping;
+ struct address_space *mapping = VFS_I(ip)->i_mapping;
if (mapping->nrpages)
truncate_inode_pages(mapping, first);
@@ -44,7 +44,7 @@ xfs_flushinval_pages(
xfs_off_t last,
int fiopt)
{
- struct address_space *mapping = ip->i_vnode->i_mapping;
+ struct address_space *mapping = VFS_I(ip)->i_mapping;
int ret = 0;
if (mapping->nrpages) {
@@ -64,7 +64,7 @@ xfs_flush_pages(
uint64_t flags,
int fiopt)
{
- struct address_space *mapping = ip->i_vnode->i_mapping;
+ struct address_space *mapping = VFS_I(ip)->i_mapping;
int ret = 0;
int ret2;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index a42ba9d7115..48799ba7e3e 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -48,6 +48,8 @@
#include "xfs_dfrag.h"
#include "xfs_fsops.h"
#include "xfs_vnodeops.h"
+#include "xfs_quota.h"
+#include "xfs_inode_item.h"
#include <linux/capability.h>
#include <linux/dcache.h>
@@ -84,17 +86,15 @@ xfs_find_handle(
switch (cmd) {
case XFS_IOC_PATH_TO_FSHANDLE:
case XFS_IOC_PATH_TO_HANDLE: {
- struct nameidata nd;
- int error;
-
- error = user_path_walk_link((const char __user *)hreq.path, &nd);
+ struct path path;
+ int error = user_lpath((const char __user *)hreq.path, &path);
if (error)
return error;
- ASSERT(nd.path.dentry);
- ASSERT(nd.path.dentry->d_inode);
- inode = igrab(nd.path.dentry->d_inode);
- path_put(&nd.path);
+ ASSERT(path.dentry);
+ ASSERT(path.dentry->d_inode);
+ inode = igrab(path.dentry->d_inode);
+ path_put(&path);
break;
}
@@ -245,7 +245,7 @@ xfs_vget_fsop_handlereq(
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- *inode = XFS_ITOV(ip);
+ *inode = VFS_I(ip);
return 0;
}
@@ -470,6 +470,12 @@ xfs_attrlist_by_handle(
if (al_hreq.buflen > XATTR_LIST_MAX)
return -XFS_ERROR(EINVAL);
+ /*
+ * Reject flags, only allow namespaces.
+ */
+ if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
+ return -XFS_ERROR(EINVAL);
+
error = xfs_vget_fsop_handlereq(mp, parinode, &al_hreq.hreq, &inode);
if (error)
goto out;
@@ -589,7 +595,7 @@ xfs_attrmulti_by_handle(
goto out;
error = E2BIG;
- size = am_hreq.opcount * sizeof(attr_multiop_t);
+ size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
if (!size || size > 16 * PAGE_SIZE)
goto out_vn_rele;
@@ -682,9 +688,9 @@ xfs_ioc_space(
return -XFS_ERROR(EFAULT);
if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
- attr_flags |= ATTR_NONBLOCK;
+ attr_flags |= XFS_ATTR_NONBLOCK;
if (ioflags & IO_INVIS)
- attr_flags |= ATTR_DMI;
+ attr_flags |= XFS_ATTR_DMI;
error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos,
NULL, attr_flags);
@@ -875,6 +881,322 @@ xfs_ioc_fsgetxattr(
return 0;
}
+STATIC void
+xfs_set_diflags(
+ struct xfs_inode *ip,
+ unsigned int xflags)
+{
+ unsigned int di_flags;
+
+ /* can't set PREALLOC this way, just preserve it */
+ di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
+ if (xflags & XFS_XFLAG_IMMUTABLE)
+ di_flags |= XFS_DIFLAG_IMMUTABLE;
+ if (xflags & XFS_XFLAG_APPEND)
+ di_flags |= XFS_DIFLAG_APPEND;
+ if (xflags & XFS_XFLAG_SYNC)
+ di_flags |= XFS_DIFLAG_SYNC;
+ if (xflags & XFS_XFLAG_NOATIME)
+ di_flags |= XFS_DIFLAG_NOATIME;
+ if (xflags & XFS_XFLAG_NODUMP)
+ di_flags |= XFS_DIFLAG_NODUMP;
+ if (xflags & XFS_XFLAG_PROJINHERIT)
+ di_flags |= XFS_DIFLAG_PROJINHERIT;
+ if (xflags & XFS_XFLAG_NODEFRAG)
+ di_flags |= XFS_DIFLAG_NODEFRAG;
+ if (xflags & XFS_XFLAG_FILESTREAM)
+ di_flags |= XFS_DIFLAG_FILESTREAM;
+ if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
+ if (xflags & XFS_XFLAG_RTINHERIT)
+ di_flags |= XFS_DIFLAG_RTINHERIT;
+ if (xflags & XFS_XFLAG_NOSYMLINKS)
+ di_flags |= XFS_DIFLAG_NOSYMLINKS;
+ if (xflags & XFS_XFLAG_EXTSZINHERIT)
+ di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+ } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
+ if (xflags & XFS_XFLAG_REALTIME)
+ di_flags |= XFS_DIFLAG_REALTIME;
+ if (xflags & XFS_XFLAG_EXTSIZE)
+ di_flags |= XFS_DIFLAG_EXTSIZE;
+ }
+
+ ip->i_d.di_flags = di_flags;
+}
+
+STATIC void
+xfs_diflags_to_linux(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = VFS_I(ip);
+ unsigned int xflags = xfs_ip2xflags(ip);
+
+ if (xflags & XFS_XFLAG_IMMUTABLE)
+ inode->i_flags |= S_IMMUTABLE;
+ else
+ inode->i_flags &= ~S_IMMUTABLE;
+ if (xflags & XFS_XFLAG_APPEND)
+ inode->i_flags |= S_APPEND;
+ else
+ inode->i_flags &= ~S_APPEND;
+ if (xflags & XFS_XFLAG_SYNC)
+ inode->i_flags |= S_SYNC;
+ else
+ inode->i_flags &= ~S_SYNC;
+ if (xflags & XFS_XFLAG_NOATIME)
+ inode->i_flags |= S_NOATIME;
+ else
+ inode->i_flags &= ~S_NOATIME;
+}
+
+#define FSX_PROJID 1
+#define FSX_EXTSIZE 2
+#define FSX_XFLAGS 4
+#define FSX_NONBLOCK 8
+
+STATIC int
+xfs_ioctl_setattr(
+ xfs_inode_t *ip,
+ struct fsxattr *fa,
+ int mask)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ unsigned int lock_flags = 0;
+ struct xfs_dquot *udqp = NULL, *gdqp = NULL;
+ struct xfs_dquot *olddquot = NULL;
+ int code;
+
+ xfs_itrace_entry(ip);
+
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return XFS_ERROR(EROFS);
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ /*
+ * If disk quotas is on, we make sure that the dquots do exist on disk,
+ * before we start any other transactions. Trying to do this later
+ * is messy. We don't care to take a readlock to look at the ids
+ * in inode here, because we can't hold it across the trans_reserve.
+ * If the IDs do change before we take the ilock, we're covered
+ * because the i_*dquot fields will get updated anyway.
+ */
+ if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
+ code = XFS_QM_DQVOPALLOC(mp, ip, ip->i_d.di_uid,
+ ip->i_d.di_gid, fa->fsx_projid,
+ XFS_QMOPT_PQUOTA, &udqp, &gdqp);
+ if (code)
+ return code;
+ }
+
+ /*
+ * For the other attributes, we acquire the inode lock and
+ * first do an error checking pass.
+ */
+ tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+ code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+ if (code)
+ goto error_return;
+
+ lock_flags = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lock_flags);
+
+ /*
+ * CAP_FOWNER overrides the following restrictions:
+ *
+ * The user ID of the calling process must be equal
+ * to the file owner ID, except in cases where the
+ * CAP_FSETID capability is applicable.
+ */
+ if (current->fsuid != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
+ code = XFS_ERROR(EPERM);
+ goto error_return;
+ }
+
+ /*
+ * Do a quota reservation only if projid is actually going to change.
+ */
+ if (mask & FSX_PROJID) {
+ if (XFS_IS_PQUOTA_ON(mp) &&
+ ip->i_d.di_projid != fa->fsx_projid) {
+ ASSERT(tp);
+ code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
+ capable(CAP_FOWNER) ?
+ XFS_QMOPT_FORCE_RES : 0);
+ if (code) /* out of quota */
+ goto error_return;
+ }
+ }
+
+ if (mask & FSX_EXTSIZE) {
+ /*
+ * Can't change extent size if any extents are allocated.
+ */
+ if (ip->i_d.di_nextents &&
+ ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
+ fa->fsx_extsize)) {
+ code = XFS_ERROR(EINVAL); /* EFBIG? */
+ goto error_return;
+ }
+
+ /*
+ * Extent size must be a multiple of the appropriate block
+ * size, if set at all.
+ */
+ if (fa->fsx_extsize != 0) {
+ xfs_extlen_t size;
+
+ if (XFS_IS_REALTIME_INODE(ip) ||
+ ((mask & FSX_XFLAGS) &&
+ (fa->fsx_xflags & XFS_XFLAG_REALTIME))) {
+ size = mp->m_sb.sb_rextsize <<
+ mp->m_sb.sb_blocklog;
+ } else {
+ size = mp->m_sb.sb_blocksize;
+ }
+
+ if (fa->fsx_extsize % size) {
+ code = XFS_ERROR(EINVAL);
+ goto error_return;
+ }
+ }
+ }
+
+
+ if (mask & FSX_XFLAGS) {
+ /*
+ * Can't change realtime flag if any extents are allocated.
+ */
+ if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
+ (XFS_IS_REALTIME_INODE(ip)) !=
+ (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+ code = XFS_ERROR(EINVAL); /* EFBIG? */
+ goto error_return;
+ }
+
+ /*
+ * If realtime flag is set then must have realtime data.
+ */
+ if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+ if ((mp->m_sb.sb_rblocks == 0) ||
+ (mp->m_sb.sb_rextsize == 0) ||
+ (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
+ code = XFS_ERROR(EINVAL);
+ goto error_return;
+ }
+ }
+
+ /*
+ * Can't modify an immutable/append-only file unless
+ * we have appropriate permission.
+ */
+ if ((ip->i_d.di_flags &
+ (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
+ (fa->fsx_xflags &
+ (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
+ !capable(CAP_LINUX_IMMUTABLE)) {
+ code = XFS_ERROR(EPERM);
+ goto error_return;
+ }
+ }
+
+ xfs_trans_ijoin(tp, ip, lock_flags);
+ xfs_trans_ihold(tp, ip);
+
+ /*
+ * Change file ownership. Must be the owner or privileged.
+ * If the system was configured with the "restricted_chown"
+ * option, the owner is not permitted to give away the file,
+ * and can change the group id only to a group of which he
+ * or she is a member.
+ */
+ if (mask & FSX_PROJID) {
+ /*
+ * CAP_FSETID overrides the following restrictions:
+ *
+ * The set-user-ID and set-group-ID bits of a file will be
+ * cleared upon successful return from chown()
+ */
+ if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+ !capable(CAP_FSETID))
+ ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+
+ /*
+ * Change the ownerships and register quota modifications
+ * in the transaction.
+ */
+ if (ip->i_d.di_projid != fa->fsx_projid) {
+ if (XFS_IS_PQUOTA_ON(mp)) {
+ olddquot = XFS_QM_DQVOPCHOWN(mp, tp, ip,
+ &ip->i_gdquot, gdqp);
+ }
+ ip->i_d.di_projid = fa->fsx_projid;
+
+ /*
+ * We may have to rev the inode as well as
+ * the superblock version number since projids didn't
+ * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
+ */
+ if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
+ xfs_bump_ino_vers2(tp, ip);
+ }
+
+ }
+
+ if (mask & FSX_EXTSIZE)
+ ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
+ if (mask & FSX_XFLAGS) {
+ xfs_set_diflags(ip, fa->fsx_xflags);
+ xfs_diflags_to_linux(ip);
+ }
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+
+ XFS_STATS_INC(xs_ig_attrchg);
+
+ /*
+ * If this is a synchronous mount, make sure that the
+ * transaction goes to disk before returning to the user.
+ * This is slightly sub-optimal in that truncates require
+ * two sync transactions instead of one for wsync filesystems.
+ * One for the truncate and one for the timestamps since we
+ * don't want to change the timestamps unless we're sure the
+ * truncate worked. Truncates are less than 1% of the laddis
+ * mix so this probably isn't worth the trouble to optimize.
+ */
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_trans_set_sync(tp);
+ code = xfs_trans_commit(tp, 0);
+ xfs_iunlock(ip, lock_flags);
+
+ /*
+ * Release any dquot(s) the inode had kept before chown.
+ */
+ XFS_QM_DQRELE(mp, olddquot);
+ XFS_QM_DQRELE(mp, udqp);
+ XFS_QM_DQRELE(mp, gdqp);
+
+ if (code)
+ return code;
+
+ if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE)) {
+ XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
+ NULL, DM_RIGHT_NULL, NULL, NULL, 0, 0,
+ (mask & FSX_NONBLOCK) ? DM_FLAGS_NDELAY : 0);
+ }
+
+ return 0;
+
+ error_return:
+ XFS_QM_DQRELE(mp, udqp);
+ XFS_QM_DQRELE(mp, gdqp);
+ xfs_trans_cancel(tp, 0);
+ if (lock_flags)
+ xfs_iunlock(ip, lock_flags);
+ return code;
+}
+
STATIC int
xfs_ioc_fssetxattr(
xfs_inode_t *ip,
@@ -882,31 +1204,16 @@ xfs_ioc_fssetxattr(
void __user *arg)
{
struct fsxattr fa;
- struct bhv_vattr *vattr;
- int error;
- int attr_flags;
+ unsigned int mask;
if (copy_from_user(&fa, arg, sizeof(fa)))
return -EFAULT;
- vattr = kmalloc(sizeof(*vattr), GFP_KERNEL);
- if (unlikely(!vattr))
- return -ENOMEM;
-
- attr_flags = 0;
+ mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID;
if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
- attr_flags |= ATTR_NONBLOCK;
-
- vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID;
- vattr->va_xflags = fa.fsx_xflags;
- vattr->va_extsize = fa.fsx_extsize;
- vattr->va_projid = fa.fsx_projid;
+ mask |= FSX_NONBLOCK;
- error = -xfs_setattr(ip, vattr, attr_flags, NULL);
- if (!error)
- vn_revalidate(XFS_ITOV(ip)); /* update flags */
- kfree(vattr);
- return 0;
+ return -xfs_ioctl_setattr(ip, &fa, mask);
}
STATIC int
@@ -928,10 +1235,9 @@ xfs_ioc_setxflags(
struct file *filp,
void __user *arg)
{
- struct bhv_vattr *vattr;
+ struct fsxattr fa;
unsigned int flags;
- int attr_flags;
- int error;
+ unsigned int mask;
if (copy_from_user(&flags, arg, sizeof(flags)))
return -EFAULT;
@@ -941,22 +1247,12 @@ xfs_ioc_setxflags(
FS_SYNC_FL))
return -EOPNOTSUPP;
- vattr = kmalloc(sizeof(*vattr), GFP_KERNEL);
- if (unlikely(!vattr))
- return -ENOMEM;
-
- attr_flags = 0;
+ mask = FSX_XFLAGS;
if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
- attr_flags |= ATTR_NONBLOCK;
-
- vattr->va_mask = XFS_AT_XFLAGS;
- vattr->va_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
+ mask |= FSX_NONBLOCK;
+ fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
- error = -xfs_setattr(ip, vattr, attr_flags, NULL);
- if (likely(!error))
- vn_revalidate(XFS_ITOV(ip)); /* update flags */
- kfree(vattr);
- return error;
+ return -xfs_ioctl_setattr(ip, &fa, mask);
}
STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 2bf287ef548..095d271f343 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -62,7 +62,7 @@ void
xfs_synchronize_atime(
xfs_inode_t *ip)
{
- struct inode *inode = ip->i_vnode;
+ struct inode *inode = VFS_I(ip);
if (inode) {
ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
@@ -79,7 +79,7 @@ void
xfs_mark_inode_dirty_sync(
xfs_inode_t *ip)
{
- struct inode *inode = ip->i_vnode;
+ struct inode *inode = VFS_I(ip);
if (inode)
mark_inode_dirty_sync(inode);
@@ -89,36 +89,31 @@ xfs_mark_inode_dirty_sync(
* Change the requested timestamp in the given inode.
* We don't lock across timestamp updates, and we don't log them but
* we do record the fact that there is dirty information in core.
- *
- * NOTE -- callers MUST combine XFS_ICHGTIME_MOD or XFS_ICHGTIME_CHG
- * with XFS_ICHGTIME_ACC to be sure that access time
- * update will take. Calling first with XFS_ICHGTIME_ACC
- * and then XFS_ICHGTIME_MOD may fail to modify the access
- * timestamp if the filesystem is mounted noacctm.
*/
void
xfs_ichgtime(
xfs_inode_t *ip,
int flags)
{
- struct inode *inode = vn_to_inode(XFS_ITOV(ip));
+ struct inode *inode = VFS_I(ip);
timespec_t tv;
+ int sync_it = 0;
+
+ tv = current_fs_time(inode->i_sb);
- nanotime(&tv);
- if (flags & XFS_ICHGTIME_MOD) {
+ if ((flags & XFS_ICHGTIME_MOD) &&
+ !timespec_equal(&inode->i_mtime, &tv)) {
inode->i_mtime = tv;
ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
+ sync_it = 1;
}
- if (flags & XFS_ICHGTIME_ACC) {
- inode->i_atime = tv;
- ip->i_d.di_atime.t_sec = (__int32_t)tv.tv_sec;
- ip->i_d.di_atime.t_nsec = (__int32_t)tv.tv_nsec;
- }
- if (flags & XFS_ICHGTIME_CHG) {
+ if ((flags & XFS_ICHGTIME_CHG) &&
+ !timespec_equal(&inode->i_ctime, &tv)) {
inode->i_ctime = tv;
ip->i_d.di_ctime.t_sec = (__int32_t)tv.tv_sec;
ip->i_d.di_ctime.t_nsec = (__int32_t)tv.tv_nsec;
+ sync_it = 1;
}
/*
@@ -130,72 +125,11 @@ xfs_ichgtime(
* ensure that the compiler does not reorder the update
* of i_update_core above the timestamp updates above.
*/
- SYNCHRONIZE();
- ip->i_update_core = 1;
- if (!(inode->i_state & I_NEW))
+ if (sync_it) {
+ SYNCHRONIZE();
+ ip->i_update_core = 1;
mark_inode_dirty_sync(inode);
-}
-
-/*
- * Variant on the above which avoids querying the system clock
- * in situations where we know the Linux inode timestamps have
- * just been updated (and so we can update our inode cheaply).
- */
-void
-xfs_ichgtime_fast(
- xfs_inode_t *ip,
- struct inode *inode,
- int flags)
-{
- timespec_t *tvp;
-
- /*
- * Atime updates for read() & friends are handled lazily now, and
- * explicit updates must go through xfs_ichgtime()
- */
- ASSERT((flags & XFS_ICHGTIME_ACC) == 0);
-
- if (flags & XFS_ICHGTIME_MOD) {
- tvp = &inode->i_mtime;
- ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec;
- ip->i_d.di_mtime.t_nsec = (__int32_t)tvp->tv_nsec;
- }
- if (flags & XFS_ICHGTIME_CHG) {
- tvp = &inode->i_ctime;
- ip->i_d.di_ctime.t_sec = (__int32_t)tvp->tv_sec;
- ip->i_d.di_ctime.t_nsec = (__int32_t)tvp->tv_nsec;
}
-
- /*
- * We update the i_update_core field _after_ changing
- * the timestamps in order to coordinate properly with
- * xfs_iflush() so that we don't lose timestamp updates.
- * This keeps us from having to hold the inode lock
- * while doing this. We use the SYNCHRONIZE macro to
- * ensure that the compiler does not reorder the update
- * of i_update_core above the timestamp updates above.
- */
- SYNCHRONIZE();
- ip->i_update_core = 1;
- if (!(inode->i_state & I_NEW))
- mark_inode_dirty_sync(inode);
-}
-
-
-/*
- * Pull the link count and size up from the xfs inode to the linux inode
- */
-STATIC void
-xfs_validate_fields(
- struct inode *inode)
-{
- struct xfs_inode *ip = XFS_I(inode);
- loff_t size;
-
- /* we're under i_sem so i_size can't change under us */
- size = XFS_ISIZE(ip);
- if (i_size_read(inode) != size)
- i_size_write(inode, size);
}
/*
@@ -245,8 +179,7 @@ STATIC void
xfs_cleanup_inode(
struct inode *dir,
struct inode *inode,
- struct dentry *dentry,
- int mode)
+ struct dentry *dentry)
{
struct xfs_name teardown;
@@ -257,10 +190,7 @@ xfs_cleanup_inode(
*/
xfs_dentry_to_name(&teardown, dentry);
- if (S_ISDIR(mode))
- xfs_rmdir(XFS_I(dir), &teardown, XFS_I(inode));
- else
- xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
+ xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
iput(inode);
}
@@ -275,7 +205,7 @@ xfs_vn_mknod(
struct xfs_inode *ip = NULL;
xfs_acl_t *default_acl = NULL;
struct xfs_name name;
- attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS;
+ int (*test_default_acl)(struct inode *) = _ACL_DEFAULT_EXISTS;
int error;
/*
@@ -320,7 +250,7 @@ xfs_vn_mknod(
if (unlikely(error))
goto out_free_acl;
- inode = ip->i_vnode;
+ inode = VFS_I(ip);
error = xfs_init_security(inode, dir);
if (unlikely(error))
@@ -335,14 +265,11 @@ xfs_vn_mknod(
}
- if (S_ISDIR(mode))
- xfs_validate_fields(inode);
d_instantiate(dentry, inode);
- xfs_validate_fields(dir);
return -error;
out_cleanup_inode:
- xfs_cleanup_inode(dir, inode, dentry, mode);
+ xfs_cleanup_inode(dir, inode, dentry);
out_free_acl:
if (default_acl)
_ACL_FREE(default_acl);
@@ -382,7 +309,7 @@ xfs_vn_lookup(
return ERR_PTR(-ENAMETOOLONG);
xfs_dentry_to_name(&name, dentry);
- error = xfs_lookup(XFS_I(dir), &name, &cip);
+ error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
if (unlikely(error)) {
if (unlikely(error != ENOENT))
return ERR_PTR(-error);
@@ -390,7 +317,47 @@ xfs_vn_lookup(
return NULL;
}
- return d_splice_alias(cip->i_vnode, dentry);
+ return d_splice_alias(VFS_I(cip), dentry);
+}
+
+STATIC struct dentry *
+xfs_vn_ci_lookup(
+ struct inode *dir,
+ struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct xfs_inode *ip;
+ struct xfs_name xname;
+ struct xfs_name ci_name;
+ struct qstr dname;
+ int error;
+
+ if (dentry->d_name.len >= MAXNAMELEN)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ xfs_dentry_to_name(&xname, dentry);
+ error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
+ if (unlikely(error)) {
+ if (unlikely(error != ENOENT))
+ return ERR_PTR(-error);
+ /*
+ * call d_add(dentry, NULL) here when d_drop_negative_children
+ * is called in xfs_vn_mknod (ie. allow negative dentries
+ * with CI filesystems).
+ */
+ return NULL;
+ }
+
+ /* if exact match, just splice and exit */
+ if (!ci_name.name)
+ return d_splice_alias(VFS_I(ip), dentry);
+
+ /* else case-insensitive match... */
+ dname.name = ci_name.name;
+ dname.len = ci_name.len;
+ dentry = d_add_ci(dentry, VFS_I(ip), &dname);
+ kmem_free(ci_name.name);
+ return dentry;
}
STATIC int
@@ -414,7 +381,6 @@ xfs_vn_link(
}
xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED);
- xfs_validate_fields(inode);
d_instantiate(dentry, inode);
return 0;
}
@@ -424,19 +390,23 @@ xfs_vn_unlink(
struct inode *dir,
struct dentry *dentry)
{
- struct inode *inode;
struct xfs_name name;
int error;
- inode = dentry->d_inode;
xfs_dentry_to_name(&name, dentry);
- error = xfs_remove(XFS_I(dir), &name, XFS_I(inode));
- if (likely(!error)) {
- xfs_validate_fields(dir); /* size needs update */
- xfs_validate_fields(inode);
- }
- return -error;
+ error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
+ if (error)
+ return error;
+
+ /*
+ * With unlink, the VFS makes the dentry "negative": no inode,
+ * but still hashed. This is incompatible with case-insensitive
+ * mode, so invalidate (unhash) the dentry in CI-mode.
+ */
+ if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb))
+ d_invalidate(dentry);
+ return 0;
}
STATIC int
@@ -459,43 +429,22 @@ xfs_vn_symlink(
if (unlikely(error))
goto out;
- inode = cip->i_vnode;
+ inode = VFS_I(cip);
error = xfs_init_security(inode, dir);
if (unlikely(error))
goto out_cleanup_inode;
d_instantiate(dentry, inode);
- xfs_validate_fields(dir);
- xfs_validate_fields(inode);
return 0;
out_cleanup_inode:
- xfs_cleanup_inode(dir, inode, dentry, 0);
+ xfs_cleanup_inode(dir, inode, dentry);
out:
return -error;
}
STATIC int
-xfs_vn_rmdir(
- struct inode *dir,
- struct dentry *dentry)
-{
- struct inode *inode = dentry->d_inode;
- struct xfs_name name;
- int error;
-
- xfs_dentry_to_name(&name, dentry);
-
- error = xfs_rmdir(XFS_I(dir), &name, XFS_I(inode));
- if (likely(!error)) {
- xfs_validate_fields(inode);
- xfs_validate_fields(dir);
- }
- return -error;
-}
-
-STATIC int
xfs_vn_rename(
struct inode *odir,
struct dentry *odentry,
@@ -505,22 +454,13 @@ xfs_vn_rename(
struct inode *new_inode = ndentry->d_inode;
struct xfs_name oname;
struct xfs_name nname;
- int error;
xfs_dentry_to_name(&oname, odentry);
xfs_dentry_to_name(&nname, ndentry);
- error = xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
+ return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
XFS_I(ndir), &nname, new_inode ?
XFS_I(new_inode) : NULL);
- if (likely(!error)) {
- if (new_inode)
- xfs_validate_fields(new_inode);
- xfs_validate_fields(odir);
- if (ndir != odir)
- xfs_validate_fields(ndir);
- }
- return -error;
}
/*
@@ -589,8 +529,7 @@ xfs_check_acl(
STATIC int
xfs_vn_permission(
struct inode *inode,
- int mask,
- struct nameidata *nd)
+ int mask)
{
return generic_permission(inode, mask, xfs_check_acl);
}
@@ -660,57 +599,9 @@ xfs_vn_getattr(
STATIC int
xfs_vn_setattr(
struct dentry *dentry,
- struct iattr *attr)
+ struct iattr *iattr)
{
- struct inode *inode = dentry->d_inode;
- unsigned int ia_valid = attr->ia_valid;
- bhv_vattr_t vattr = { 0 };
- int flags = 0;
- int error;
-
- if (ia_valid & ATTR_UID) {
- vattr.va_mask |= XFS_AT_UID;
- vattr.va_uid = attr->ia_uid;
- }
- if (ia_valid & ATTR_GID) {
- vattr.va_mask |= XFS_AT_GID;
- vattr.va_gid = attr->ia_gid;
- }
- if (ia_valid & ATTR_SIZE) {
- vattr.va_mask |= XFS_AT_SIZE;
- vattr.va_size = attr->ia_size;
- }
- if (ia_valid & ATTR_ATIME) {
- vattr.va_mask |= XFS_AT_ATIME;
- vattr.va_atime = attr->ia_atime;
- inode->i_atime = attr->ia_atime;
- }
- if (ia_valid & ATTR_MTIME) {
- vattr.va_mask |= XFS_AT_MTIME;
- vattr.va_mtime = attr->ia_mtime;
- }
- if (ia_valid & ATTR_CTIME) {
- vattr.va_mask |= XFS_AT_CTIME;
- vattr.va_ctime = attr->ia_ctime;
- }
- if (ia_valid & ATTR_MODE) {
- vattr.va_mask |= XFS_AT_MODE;
- vattr.va_mode = attr->ia_mode;
- if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
- inode->i_mode &= ~S_ISGID;
- }
-
- if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET))
- flags |= ATTR_UTIME;
-#ifdef ATTR_NO_BLOCK
- if ((ia_valid & ATTR_NO_BLOCK))
- flags |= ATTR_NONBLOCK;
-#endif
-
- error = xfs_setattr(XFS_I(inode), &vattr, flags, NULL);
- if (likely(!error))
- vn_revalidate(vn_from_inode(inode));
- return -error;
+ return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL);
}
/*
@@ -728,109 +619,6 @@ xfs_vn_truncate(
WARN_ON(error);
}
-STATIC int
-xfs_vn_setxattr(
- struct dentry *dentry,
- const char *name,
- const void *data,
- size_t size,
- int flags)
-{
- bhv_vnode_t *vp = vn_from_inode(dentry->d_inode);
- char *attr = (char *)name;
- attrnames_t *namesp;
- int xflags = 0;
- int error;
-
- namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
- if (!namesp)
- return -EOPNOTSUPP;
- attr += namesp->attr_namelen;
- error = namesp->attr_capable(vp, NULL);
- if (error)
- return error;
-
- /* Convert Linux syscall to XFS internal ATTR flags */
- if (flags & XATTR_CREATE)
- xflags |= ATTR_CREATE;
- if (flags & XATTR_REPLACE)
- xflags |= ATTR_REPLACE;
- xflags |= namesp->attr_flag;
- return namesp->attr_set(vp, attr, (void *)data, size, xflags);
-}
-
-STATIC ssize_t
-xfs_vn_getxattr(
- struct dentry *dentry,
- const char *name,
- void *data,
- size_t size)
-{
- bhv_vnode_t *vp = vn_from_inode(dentry->d_inode);
- char *attr = (char *)name;
- attrnames_t *namesp;
- int xflags = 0;
- ssize_t error;
-
- namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
- if (!namesp)
- return -EOPNOTSUPP;
- attr += namesp->attr_namelen;
- error = namesp->attr_capable(vp, NULL);
- if (error)
- return error;
-
- /* Convert Linux syscall to XFS internal ATTR flags */
- if (!size) {
- xflags |= ATTR_KERNOVAL;
- data = NULL;
- }
- xflags |= namesp->attr_flag;
- return namesp->attr_get(vp, attr, (void *)data, size, xflags);
-}
-
-STATIC ssize_t
-xfs_vn_listxattr(
- struct dentry *dentry,
- char *data,
- size_t size)
-{
- bhv_vnode_t *vp = vn_from_inode(dentry->d_inode);
- int error, xflags = ATTR_KERNAMELS;
- ssize_t result;
-
- if (!size)
- xflags |= ATTR_KERNOVAL;
- xflags |= capable(CAP_SYS_ADMIN) ? ATTR_KERNFULLS : ATTR_KERNORMALS;
-
- error = attr_generic_list(vp, data, size, xflags, &result);
- if (error < 0)
- return error;
- return result;
-}
-
-STATIC int
-xfs_vn_removexattr(
- struct dentry *dentry,
- const char *name)
-{
- bhv_vnode_t *vp = vn_from_inode(dentry->d_inode);
- char *attr = (char *)name;
- attrnames_t *namesp;
- int xflags = 0;
- int error;
-
- namesp = attr_lookup_namespace(attr, attr_namespaces, ATTR_NAMECOUNT);
- if (!namesp)
- return -EOPNOTSUPP;
- attr += namesp->attr_namelen;
- error = namesp->attr_capable(vp, NULL);
- if (error)
- return error;
- xflags |= namesp->attr_flag;
- return namesp->attr_remove(vp, attr, xflags);
-}
-
STATIC long
xfs_vn_fallocate(
struct inode *inode,
@@ -854,18 +642,18 @@ xfs_vn_fallocate(
xfs_ilock(ip, XFS_IOLOCK_EXCL);
error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
- 0, NULL, ATTR_NOLOCK);
+ 0, NULL, XFS_ATTR_NOLOCK);
if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
offset + len > i_size_read(inode))
new_size = offset + len;
/* Change file size if needed */
if (new_size) {
- bhv_vattr_t va;
+ struct iattr iattr;
- va.va_mask = XFS_AT_SIZE;
- va.va_size = new_size;
- error = xfs_setattr(ip, &va, ATTR_NOLOCK, NULL);
+ iattr.ia_valid = ATTR_SIZE;
+ iattr.ia_size = new_size;
+ error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK, NULL);
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -873,46 +661,172 @@ out_error:
return error;
}
-const struct inode_operations xfs_inode_operations = {
+static const struct inode_operations xfs_inode_operations = {
.permission = xfs_vn_permission,
.truncate = xfs_vn_truncate,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
- .setxattr = xfs_vn_setxattr,
- .getxattr = xfs_vn_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
- .removexattr = xfs_vn_removexattr,
.fallocate = xfs_vn_fallocate,
};
-const struct inode_operations xfs_dir_inode_operations = {
+static const struct inode_operations xfs_dir_inode_operations = {
.create = xfs_vn_create,
.lookup = xfs_vn_lookup,
.link = xfs_vn_link,
.unlink = xfs_vn_unlink,
.symlink = xfs_vn_symlink,
.mkdir = xfs_vn_mkdir,
- .rmdir = xfs_vn_rmdir,
+ /*
+ * Yes, XFS uses the same method for rmdir and unlink.
+ *
+ * There are some subtile differences deeper in the code,
+ * but we use S_ISDIR to check for those.
+ */
+ .rmdir = xfs_vn_unlink,
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
.permission = xfs_vn_permission,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
- .setxattr = xfs_vn_setxattr,
- .getxattr = xfs_vn_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
- .removexattr = xfs_vn_removexattr,
};
-const struct inode_operations xfs_symlink_inode_operations = {
+static const struct inode_operations xfs_dir_ci_inode_operations = {
+ .create = xfs_vn_create,
+ .lookup = xfs_vn_ci_lookup,
+ .link = xfs_vn_link,
+ .unlink = xfs_vn_unlink,
+ .symlink = xfs_vn_symlink,
+ .mkdir = xfs_vn_mkdir,
+ /*
+ * Yes, XFS uses the same method for rmdir and unlink.
+ *
+ * There are some subtile differences deeper in the code,
+ * but we use S_ISDIR to check for those.
+ */
+ .rmdir = xfs_vn_unlink,
+ .mknod = xfs_vn_mknod,
+ .rename = xfs_vn_rename,
+ .permission = xfs_vn_permission,
+ .getattr = xfs_vn_getattr,
+ .setattr = xfs_vn_setattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+ .listxattr = xfs_vn_listxattr,
+};
+
+static const struct inode_operations xfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = xfs_vn_follow_link,
.put_link = xfs_vn_put_link,
.permission = xfs_vn_permission,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
- .setxattr = xfs_vn_setxattr,
- .getxattr = xfs_vn_getxattr,
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
- .removexattr = xfs_vn_removexattr,
};
+
+STATIC void
+xfs_diflags_to_iflags(
+ struct inode *inode,
+ struct xfs_inode *ip)
+{
+ if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+ inode->i_flags |= S_IMMUTABLE;
+ else
+ inode->i_flags &= ~S_IMMUTABLE;
+ if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+ inode->i_flags |= S_APPEND;
+ else
+ inode->i_flags &= ~S_APPEND;
+ if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+ inode->i_flags |= S_SYNC;
+ else
+ inode->i_flags &= ~S_SYNC;
+ if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+ inode->i_flags |= S_NOATIME;
+ else
+ inode->i_flags &= ~S_NOATIME;
+}
+
+/*
+ * Initialize the Linux inode, set up the operation vectors and
+ * unlock the inode.
+ *
+ * When reading existing inodes from disk this is called directly
+ * from xfs_iget, when creating a new inode it is called from
+ * xfs_ialloc after setting up the inode.
+ */
+void
+xfs_setup_inode(
+ struct xfs_inode *ip)
+{
+ struct inode *inode = ip->i_vnode;
+
+ inode->i_mode = ip->i_d.di_mode;
+ inode->i_nlink = ip->i_d.di_nlink;
+ inode->i_uid = ip->i_d.di_uid;
+ inode->i_gid = ip->i_d.di_gid;
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFBLK:
+ case S_IFCHR:
+ inode->i_rdev =
+ MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
+ sysv_minor(ip->i_df.if_u2.if_rdev));
+ break;
+ default:
+ inode->i_rdev = 0;
+ break;
+ }
+
+ inode->i_generation = ip->i_d.di_gen;
+ i_size_write(inode, ip->i_d.di_size);
+ inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec;
+ inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
+ inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
+ inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
+ inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
+ inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
+ xfs_diflags_to_iflags(inode, ip);
+ xfs_iflags_clear(ip, XFS_IMODIFIED);
+
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFREG:
+ inode->i_op = &xfs_inode_operations;
+ inode->i_fop = &xfs_file_operations;
+ inode->i_mapping->a_ops = &xfs_address_space_operations;
+ break;
+ case S_IFDIR:
+ if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
+ inode->i_op = &xfs_dir_ci_inode_operations;
+ else
+ inode->i_op = &xfs_dir_inode_operations;
+ inode->i_fop = &xfs_dir_file_operations;
+ break;
+ case S_IFLNK:
+ inode->i_op = &xfs_symlink_inode_operations;
+ if (!(ip->i_df.if_flags & XFS_IFINLINE))
+ inode->i_mapping->a_ops = &xfs_address_space_operations;
+ break;
+ default:
+ inode->i_op = &xfs_inode_operations;
+ init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ break;
+ }
+
+ xfs_iflags_clear(ip, XFS_INEW);
+ barrier();
+
+ unlock_new_inode(inode);
+}
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
index 14d0deb7aff..8b1a1e31dc2 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -18,23 +18,14 @@
#ifndef __XFS_IOPS_H__
#define __XFS_IOPS_H__
-extern const struct inode_operations xfs_inode_operations;
-extern const struct inode_operations xfs_dir_inode_operations;
-extern const struct inode_operations xfs_symlink_inode_operations;
+struct xfs_inode;
extern const struct file_operations xfs_file_operations;
extern const struct file_operations xfs_dir_file_operations;
extern const struct file_operations xfs_invis_file_operations;
+extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
-struct xfs_inode;
-extern void xfs_ichgtime(struct xfs_inode *, int);
-extern void xfs_ichgtime_fast(struct xfs_inode *, struct inode *, int);
-
-#define xfs_vtoi(vp) \
- ((struct xfs_inode *)vn_to_inode(vp)->i_private)
-
-#define XFS_I(inode) \
- ((struct xfs_inode *)(inode)->i_private)
+extern void xfs_setup_inode(struct xfs_inode *);
#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 4edc46915b5..cc0f7b3a979 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -45,13 +45,13 @@
#include <mrlock.h>
#include <sv.h>
#include <mutex.h>
-#include <sema.h>
#include <time.h>
#include <support/ktrace.h>
#include <support/debug.h>
#include <support/uuid.h>
+#include <linux/semaphore.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
@@ -76,6 +76,7 @@
#include <linux/log2.h>
#include <linux/spinlock.h>
#include <linux/random.h>
+#include <linux/ctype.h>
#include <asm/page.h>
#include <asm/div64.h>
@@ -125,8 +126,6 @@
#define current_cpu() (raw_smp_processor_id())
#define current_pid() (current->pid)
-#define current_fsuid(cred) (current->fsuid)
-#define current_fsgid(cred) (current->fsgid)
#define current_test_flags(f) (current->flags & (f))
#define current_set_flags_nested(sp, f) \
(*(sp) = current->flags, current->flags |= (f))
@@ -179,7 +178,7 @@
#define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL)
#define xfs_stack_trace() dump_stack()
#define xfs_itruncate_data(ip, off) \
- (-vmtruncate(vn_to_inode(XFS_ITOV(ip)), (off)))
+ (-vmtruncate(VFS_I(ip), (off)))
/* Move the kernel do_div definition off to one side */
@@ -299,4 +298,11 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
return x;
}
+/* ARM old ABI has some weird alignment/padding */
+#if defined(__arm__) && !defined(__ARM_EABI__)
+#define __arch_pack __attribute__((packed))
+#else
+#define __arch_pack
+#endif
+
#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 5e3b57516ec..1957e5357d0 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -137,7 +137,7 @@ xfs_iozero(
struct address_space *mapping;
int status;
- mapping = ip->i_vnode->i_mapping;
+ mapping = VFS_I(ip)->i_mapping;
do {
unsigned offset, bytes;
void *fsdata;
@@ -674,9 +674,7 @@ start:
*/
if (likely(!(ioflags & IO_INVIS) &&
!mnt_want_write(file->f_path.mnt))) {
- file_update_time(file);
- xfs_ichgtime_fast(xip, inode,
- XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
mnt_drop_write(file->f_path.mnt);
}
@@ -711,7 +709,7 @@ start:
!capable(CAP_FSETID)) {
error = xfs_write_clear_setuid(xip);
if (likely(!error))
- error = -remove_suid(file->f_path.dentry);
+ error = -file_remove_suid(file);
if (unlikely(error)) {
goto out_unlock_internal;
}
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index e480b610205..3d5b67c075c 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -98,12 +98,21 @@ xfs_read_xfsstats(
return len;
}
-void
+int
xfs_init_procfs(void)
{
if (!proc_mkdir("fs/xfs", NULL))
- return;
- create_proc_read_entry("fs/xfs/stat", 0, NULL, xfs_read_xfsstats, NULL);
+ goto out;
+
+ if (!create_proc_read_entry("fs/xfs/stat", 0, NULL,
+ xfs_read_xfsstats, NULL))
+ goto out_remove_entry;
+ return 0;
+
+ out_remove_entry:
+ remove_proc_entry("fs/xfs", NULL);
+ out:
+ return -ENOMEM;
}
void
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
index afd0b0d5fdb..e83820febc9 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -134,7 +134,7 @@ DECLARE_PER_CPU(struct xfsstats, xfsstats);
#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--)
#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc))
-extern void xfs_init_procfs(void);
+extern int xfs_init_procfs(void);
extern void xfs_cleanup_procfs(void);
@@ -144,8 +144,14 @@ extern void xfs_cleanup_procfs(void);
# define XFS_STATS_DEC(count)
# define XFS_STATS_ADD(count, inc)
-static inline void xfs_init_procfs(void) { };
-static inline void xfs_cleanup_procfs(void) { };
+static inline int xfs_init_procfs(void)
+{
+ return 0;
+}
+
+static inline void xfs_cleanup_procfs(void)
+{
+}
#endif /* !CONFIG_PROC_FS */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 742b2c7852c..7227b2efef2 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -52,6 +52,12 @@
#include "xfs_version.h"
#include "xfs_log_priv.h"
#include "xfs_trans_priv.h"
+#include "xfs_filestream.h"
+#include "xfs_da_btree.h"
+#include "xfs_dir2_trace.h"
+#include "xfs_extfree_item.h"
+#include "xfs_mru_cache.h"
+#include "xfs_inode_item.h"
#include <linux/namei.h>
#include <linux/init.h>
@@ -60,6 +66,7 @@
#include <linux/writeback.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/parser.h>
static struct quotactl_ops xfs_quotactl_operations;
static struct super_operations xfs_super_operations;
@@ -74,7 +81,10 @@ xfs_args_allocate(
{
struct xfs_mount_args *args;
- args = kmem_zalloc(sizeof(struct xfs_mount_args), KM_SLEEP);
+ args = kzalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
+ if (!args)
+ return NULL;
+
args->logbufs = args->logbufsize = -1;
strncpy(args->fsname, sb->s_id, MAXNAMELEN);
@@ -138,6 +148,23 @@ xfs_args_allocate(
#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */
#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
+/*
+ * Table driven mount option parser.
+ *
+ * Currently only used for remount, but it will be used for mount
+ * in the future, too.
+ */
+enum {
+ Opt_barrier, Opt_nobarrier, Opt_err
+};
+
+static const match_table_t tokens = {
+ {Opt_barrier, "barrier"},
+ {Opt_nobarrier, "nobarrier"},
+ {Opt_err, NULL}
+};
+
+
STATIC unsigned long
suffix_strtoul(char *s, char **endp, unsigned int base)
{
@@ -314,6 +341,7 @@ xfs_parseargs(
args->flags |= XFSMNT_ATTR2;
} else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
args->flags &= ~XFSMNT_ATTR2;
+ args->flags |= XFSMNT_NOATTR2;
} else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
args->flags2 |= XFSMNT2_FILESTREAMS;
} else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
@@ -553,115 +581,6 @@ xfs_max_file_offset(
return (((__uint64_t)pagefactor) << bitshift) - 1;
}
-STATIC_INLINE void
-xfs_set_inodeops(
- struct inode *inode)
-{
- switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
- inode->i_op = &xfs_inode_operations;
- inode->i_fop = &xfs_file_operations;
- inode->i_mapping->a_ops = &xfs_address_space_operations;
- break;
- case S_IFDIR:
- inode->i_op = &xfs_dir_inode_operations;
- inode->i_fop = &xfs_dir_file_operations;
- break;
- case S_IFLNK:
- inode->i_op = &xfs_symlink_inode_operations;
- if (!(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE))
- inode->i_mapping->a_ops = &xfs_address_space_operations;
- break;
- default:
- inode->i_op = &xfs_inode_operations;
- init_special_inode(inode, inode->i_mode, inode->i_rdev);
- break;
- }
-}
-
-STATIC_INLINE void
-xfs_revalidate_inode(
- xfs_mount_t *mp,
- bhv_vnode_t *vp,
- xfs_inode_t *ip)
-{
- struct inode *inode = vn_to_inode(vp);
-
- inode->i_mode = ip->i_d.di_mode;
- inode->i_nlink = ip->i_d.di_nlink;
- inode->i_uid = ip->i_d.di_uid;
- inode->i_gid = ip->i_d.di_gid;
-
- switch (inode->i_mode & S_IFMT) {
- case S_IFBLK:
- case S_IFCHR:
- inode->i_rdev =
- MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
- sysv_minor(ip->i_df.if_u2.if_rdev));
- break;
- default:
- inode->i_rdev = 0;
- break;
- }
-
- inode->i_generation = ip->i_d.di_gen;
- i_size_write(inode, ip->i_d.di_size);
- inode->i_atime.tv_sec = ip->i_d.di_atime.t_sec;
- inode->i_atime.tv_nsec = ip->i_d.di_atime.t_nsec;
- inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
- inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
- inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
- inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
- if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
- inode->i_flags |= S_IMMUTABLE;
- else
- inode->i_flags &= ~S_IMMUTABLE;
- if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
- inode->i_flags |= S_APPEND;
- else
- inode->i_flags &= ~S_APPEND;
- if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
- inode->i_flags |= S_SYNC;
- else
- inode->i_flags &= ~S_SYNC;
- if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
- inode->i_flags |= S_NOATIME;
- else
- inode->i_flags &= ~S_NOATIME;
- xfs_iflags_clear(ip, XFS_IMODIFIED);
-}
-
-void
-xfs_initialize_vnode(
- struct xfs_mount *mp,
- bhv_vnode_t *vp,
- struct xfs_inode *ip)
-{
- struct inode *inode = vn_to_inode(vp);
-
- if (!ip->i_vnode) {
- ip->i_vnode = vp;
- inode->i_private = ip;
- }
-
- /*
- * We need to set the ops vectors, and unlock the inode, but if
- * we have been called during the new inode create process, it is
- * too early to fill in the Linux inode. We will get called a
- * second time once the inode is properly set up, and then we can
- * finish our work.
- */
- if (ip->i_d.di_mode != 0 && (inode->i_state & I_NEW)) {
- xfs_revalidate_inode(mp, vp, ip);
- xfs_set_inodeops(inode);
-
- xfs_iflags_clear(ip, XFS_INEW);
- barrier();
-
- unlock_new_inode(inode);
- }
-}
-
int
xfs_blkdev_get(
xfs_mount_t *mp,
@@ -733,14 +652,6 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp)
return;
}
- if (mp->m_ddev_targp->bt_bdev->bd_disk->queue->ordered ==
- QUEUE_ORDERED_NONE) {
- xfs_fs_cmn_err(CE_NOTE, mp,
- "Disabling barriers, not supported by the underlying device");
- mp->m_flags &= ~XFS_MOUNT_BARRIER;
- return;
- }
-
if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
xfs_fs_cmn_err(CE_NOTE, mp,
"Disabling barriers, underlying device is readonly");
@@ -764,6 +675,139 @@ xfs_blkdev_issue_flush(
blkdev_issue_flush(buftarg->bt_bdev, NULL);
}
+STATIC void
+xfs_close_devices(
+ struct xfs_mount *mp)
+{
+ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
+ struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
+ xfs_free_buftarg(mp->m_logdev_targp);
+ xfs_blkdev_put(logdev);
+ }
+ if (mp->m_rtdev_targp) {
+ struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
+ xfs_free_buftarg(mp->m_rtdev_targp);
+ xfs_blkdev_put(rtdev);
+ }
+ xfs_free_buftarg(mp->m_ddev_targp);
+}
+
+/*
+ * The file system configurations are:
+ * (1) device (partition) with data and internal log
+ * (2) logical volume with data and log subvolumes.
+ * (3) logical volume with data, log, and realtime subvolumes.
+ *
+ * We only have to handle opening the log and realtime volumes here if
+ * they are present. The data subvolume has already been opened by
+ * get_sb_bdev() and is stored in sb->s_bdev.
+ */
+STATIC int
+xfs_open_devices(
+ struct xfs_mount *mp,
+ struct xfs_mount_args *args)
+{
+ struct block_device *ddev = mp->m_super->s_bdev;
+ struct block_device *logdev = NULL, *rtdev = NULL;
+ int error;
+
+ /*
+ * Open real time and log devices - order is important.
+ */
+ if (args->logname[0]) {
+ error = xfs_blkdev_get(mp, args->logname, &logdev);
+ if (error)
+ goto out;
+ }
+
+ if (args->rtname[0]) {
+ error = xfs_blkdev_get(mp, args->rtname, &rtdev);
+ if (error)
+ goto out_close_logdev;
+
+ if (rtdev == ddev || rtdev == logdev) {
+ cmn_err(CE_WARN,
+ "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev.");
+ error = EINVAL;
+ goto out_close_rtdev;
+ }
+ }
+
+ /*
+ * Setup xfs_mount buffer target pointers
+ */
+ error = ENOMEM;
+ mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0);
+ if (!mp->m_ddev_targp)
+ goto out_close_rtdev;
+
+ if (rtdev) {
+ mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1);
+ if (!mp->m_rtdev_targp)
+ goto out_free_ddev_targ;
+ }
+
+ if (logdev && logdev != ddev) {
+ mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1);
+ if (!mp->m_logdev_targp)
+ goto out_free_rtdev_targ;
+ } else {
+ mp->m_logdev_targp = mp->m_ddev_targp;
+ }
+
+ return 0;
+
+ out_free_rtdev_targ:
+ if (mp->m_rtdev_targp)
+ xfs_free_buftarg(mp->m_rtdev_targp);
+ out_free_ddev_targ:
+ xfs_free_buftarg(mp->m_ddev_targp);
+ out_close_rtdev:
+ if (rtdev)
+ xfs_blkdev_put(rtdev);
+ out_close_logdev:
+ if (logdev && logdev != ddev)
+ xfs_blkdev_put(logdev);
+ out:
+ return error;
+}
+
+/*
+ * Setup xfs_mount buffer target pointers based on superblock
+ */
+STATIC int
+xfs_setup_devices(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize,
+ mp->m_sb.sb_sectsize);
+ if (error)
+ return error;
+
+ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
+ unsigned int log_sector_size = BBSIZE;
+
+ if (xfs_sb_version_hassector(&mp->m_sb))
+ log_sector_size = mp->m_sb.sb_logsectsize;
+ error = xfs_setsize_buftarg(mp->m_logdev_targp,
+ mp->m_sb.sb_blocksize,
+ log_sector_size);
+ if (error)
+ return error;
+ }
+ if (mp->m_rtdev_targp) {
+ error = xfs_setsize_buftarg(mp->m_rtdev_targp,
+ mp->m_sb.sb_blocksize,
+ mp->m_sb.sb_sectsize);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
/*
* XFS AIL push thread support
*/
@@ -826,63 +870,21 @@ STATIC struct inode *
xfs_fs_alloc_inode(
struct super_block *sb)
{
- bhv_vnode_t *vp;
-
- vp = kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
- if (unlikely(!vp))
- return NULL;
- return vn_to_inode(vp);
+ return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
}
STATIC void
xfs_fs_destroy_inode(
struct inode *inode)
{
- kmem_zone_free(xfs_vnode_zone, vn_from_inode(inode));
+ kmem_zone_free(xfs_vnode_zone, inode);
}
STATIC void
xfs_fs_inode_init_once(
- kmem_zone_t *zonep,
void *vnode)
{
- inode_init_once(vn_to_inode((bhv_vnode_t *)vnode));
-}
-
-STATIC int __init
-xfs_init_zones(void)
-{
- xfs_vnode_zone = kmem_zone_init_flags(sizeof(bhv_vnode_t), "xfs_vnode",
- KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
- KM_ZONE_SPREAD,
- xfs_fs_inode_init_once);
- if (!xfs_vnode_zone)
- goto out;
-
- xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
- if (!xfs_ioend_zone)
- goto out_destroy_vnode_zone;
-
- xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
- xfs_ioend_zone);
- if (!xfs_ioend_pool)
- goto out_free_ioend_zone;
- return 0;
-
- out_free_ioend_zone:
- kmem_zone_destroy(xfs_ioend_zone);
- out_destroy_vnode_zone:
- kmem_zone_destroy(xfs_vnode_zone);
- out:
- return -ENOMEM;
-}
-
-STATIC void
-xfs_destroy_zones(void)
-{
- mempool_destroy(xfs_ioend_pool);
- kmem_zone_destroy(xfs_vnode_zone);
- kmem_zone_destroy(xfs_ioend_zone);
+ inode_init_once((struct inode *)vnode);
}
/*
@@ -987,7 +989,7 @@ void
xfs_flush_inode(
xfs_inode_t *ip)
{
- struct inode *inode = ip->i_vnode;
+ struct inode *inode = VFS_I(ip);
igrab(inode);
xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
@@ -1012,7 +1014,7 @@ void
xfs_flush_device(
xfs_inode_t *ip)
{
- struct inode *inode = vn_to_inode(XFS_ITOV(ip));
+ struct inode *inode = VFS_I(ip);
igrab(inode);
xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
@@ -1074,7 +1076,7 @@ xfssyncd(
list_del(&work->w_list);
if (work == &mp->m_sync_work)
continue;
- kmem_free(work, sizeof(struct bhv_vfs_sync_work));
+ kmem_free(work);
}
}
@@ -1082,18 +1084,76 @@ xfssyncd(
}
STATIC void
+xfs_free_fsname(
+ struct xfs_mount *mp)
+{
+ kfree(mp->m_fsname);
+ kfree(mp->m_rtname);
+ kfree(mp->m_logname);
+}
+
+STATIC void
xfs_fs_put_super(
struct super_block *sb)
{
struct xfs_mount *mp = XFS_M(sb);
+ struct xfs_inode *rip = mp->m_rootip;
+ int unmount_event_flags = 0;
int error;
kthread_stop(mp->m_sync_task);
xfs_sync(mp, SYNC_ATTR | SYNC_DELWRI);
- error = xfs_unmount(mp, 0, NULL);
- if (error)
- printk("XFS: unmount got error=%d\n", error);
+
+#ifdef HAVE_DMAPI
+ if (mp->m_flags & XFS_MOUNT_DMAPI) {
+ unmount_event_flags =
+ (mp->m_dmevmask & (1 << DM_EVENT_UNMOUNT)) ?
+ 0 : DM_FLAGS_UNWANTED;
+ /*
+ * Ignore error from dmapi here, first unmount is not allowed
+ * to fail anyway, and second we wouldn't want to fail a
+ * unmount because of dmapi.
+ */
+ XFS_SEND_PREUNMOUNT(mp, rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL,
+ NULL, NULL, 0, 0, unmount_event_flags);
+ }
+#endif
+
+ /*
+ * Blow away any referenced inode in the filestreams cache.
+ * This can and will cause log traffic as inodes go inactive
+ * here.
+ */
+ xfs_filestream_unmount(mp);
+
+ XFS_bflush(mp->m_ddev_targp);
+ error = xfs_unmount_flush(mp, 0);
+ WARN_ON(error);
+
+ /*
+ * If we're forcing a shutdown, typically because of a media error,
+ * we want to make sure we invalidate dirty pages that belong to
+ * referenced vnodes as well.
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE);
+ ASSERT(error != EFSCORRUPTED);
+ }
+
+ if (mp->m_flags & XFS_MOUNT_DMAPI) {
+ XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0,
+ unmount_event_flags);
+ }
+
+ xfs_unmountfs(mp);
+ xfs_freesb(mp);
+ xfs_icsb_destroy_counters(mp);
+ xfs_close_devices(mp);
+ xfs_qmops_put(mp);
+ xfs_dmops_put(mp);
+ xfs_free_fsname(mp);
+ kfree(mp);
}
STATIC void
@@ -1216,14 +1276,74 @@ xfs_fs_remount(
char *options)
{
struct xfs_mount *mp = XFS_M(sb);
- struct xfs_mount_args *args = xfs_args_allocate(sb, 0);
- int error;
+ substring_t args[MAX_OPT_ARGS];
+ char *p;
- error = xfs_parseargs(mp, options, args, 1);
- if (!error)
- error = xfs_mntupdate(mp, flags, args);
- kmem_free(args, sizeof(*args));
- return -error;
+ while ((p = strsep(&options, ",")) != NULL) {
+ int token;
+
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case Opt_barrier:
+ mp->m_flags |= XFS_MOUNT_BARRIER;
+
+ /*
+ * Test if barriers are actually working if we can,
+ * else delay this check until the filesystem is
+ * marked writeable.
+ */
+ if (!(mp->m_flags & XFS_MOUNT_RDONLY))
+ xfs_mountfs_check_barriers(mp);
+ break;
+ case Opt_nobarrier:
+ mp->m_flags &= ~XFS_MOUNT_BARRIER;
+ break;
+ default:
+ /*
+ * Logically we would return an error here to prevent
+ * users from believing they might have changed
+ * mount options using remount which can't be changed.
+ *
+ * But unfortunately mount(8) adds all options from
+ * mtab and fstab to the mount arguments in some cases
+ * so we can't blindly reject options, but have to
+ * check for each specified option if it actually
+ * differs from the currently set option and only
+ * reject it if that's the case.
+ *
+ * Until that is implemented we return success for
+ * every remount request, and silently ignore all
+ * options that we can't actually change.
+ */
+#if 0
+ printk(KERN_INFO
+ "XFS: mount option \"%s\" not supported for remount\n", p);
+ return -EINVAL;
+#else
+ return 0;
+#endif
+ }
+ }
+
+ /* rw/ro -> rw */
+ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
+ mp->m_flags &= ~XFS_MOUNT_RDONLY;
+ if (mp->m_flags & XFS_MOUNT_BARRIER)
+ xfs_mountfs_check_barriers(mp);
+ }
+
+ /* rw -> ro */
+ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+ xfs_filestream_flush(mp);
+ xfs_sync(mp, SYNC_DATA_QUIESCE);
+ xfs_attr_quiesce(mp);
+ mp->m_flags |= XFS_MOUNT_RDONLY;
+ }
+
+ return 0;
}
/*
@@ -1300,6 +1420,245 @@ xfs_fs_setxquota(
Q_XSETPQLIM), id, (caddr_t)fdq);
}
+/*
+ * This function fills in xfs_mount_t fields based on mount args.
+ * Note: the superblock has _not_ yet been read in.
+ */
+STATIC int
+xfs_start_flags(
+ struct xfs_mount_args *ap,
+ struct xfs_mount *mp)
+{
+ int error;
+
+ /* Values are in BBs */
+ if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
+ /*
+ * At this point the superblock has not been read
+ * in, therefore we do not know the block size.
+ * Before the mount call ends we will convert
+ * these to FSBs.
+ */
+ mp->m_dalign = ap->sunit;
+ mp->m_swidth = ap->swidth;
+ }
+
+ if (ap->logbufs != -1 &&
+ ap->logbufs != 0 &&
+ (ap->logbufs < XLOG_MIN_ICLOGS ||
+ ap->logbufs > XLOG_MAX_ICLOGS)) {
+ cmn_err(CE_WARN,
+ "XFS: invalid logbufs value: %d [not %d-%d]",
+ ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
+ return XFS_ERROR(EINVAL);
+ }
+ mp->m_logbufs = ap->logbufs;
+ if (ap->logbufsize != -1 &&
+ ap->logbufsize != 0 &&
+ (ap->logbufsize < XLOG_MIN_RECORD_BSIZE ||
+ ap->logbufsize > XLOG_MAX_RECORD_BSIZE ||
+ !is_power_of_2(ap->logbufsize))) {
+ cmn_err(CE_WARN,
+ "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
+ ap->logbufsize);
+ return XFS_ERROR(EINVAL);
+ }
+
+ error = ENOMEM;
+
+ mp->m_logbsize = ap->logbufsize;
+ mp->m_fsname_len = strlen(ap->fsname) + 1;
+
+ mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL);
+ if (!mp->m_fsname)
+ goto out;
+
+ if (ap->rtname[0]) {
+ mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL);
+ if (!mp->m_rtname)
+ goto out_free_fsname;
+
+ }
+
+ if (ap->logname[0]) {
+ mp->m_logname = kstrdup(ap->logname, GFP_KERNEL);
+ if (!mp->m_logname)
+ goto out_free_rtname;
+ }
+
+ if (ap->flags & XFSMNT_WSYNC)
+ mp->m_flags |= XFS_MOUNT_WSYNC;
+#if XFS_BIG_INUMS
+ if (ap->flags & XFSMNT_INO64) {
+ mp->m_flags |= XFS_MOUNT_INO64;
+ mp->m_inoadd = XFS_INO64_OFFSET;
+ }
+#endif
+ if (ap->flags & XFSMNT_RETERR)
+ mp->m_flags |= XFS_MOUNT_RETERR;
+ if (ap->flags & XFSMNT_NOALIGN)
+ mp->m_flags |= XFS_MOUNT_NOALIGN;
+ if (ap->flags & XFSMNT_SWALLOC)
+ mp->m_flags |= XFS_MOUNT_SWALLOC;
+ if (ap->flags & XFSMNT_OSYNCISOSYNC)
+ mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
+ if (ap->flags & XFSMNT_32BITINODES)
+ mp->m_flags |= XFS_MOUNT_32BITINODES;
+
+ if (ap->flags & XFSMNT_IOSIZE) {
+ if (ap->iosizelog > XFS_MAX_IO_LOG ||
+ ap->iosizelog < XFS_MIN_IO_LOG) {
+ cmn_err(CE_WARN,
+ "XFS: invalid log iosize: %d [not %d-%d]",
+ ap->iosizelog, XFS_MIN_IO_LOG,
+ XFS_MAX_IO_LOG);
+ return XFS_ERROR(EINVAL);
+ }
+
+ mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
+ mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
+ }
+
+ if (ap->flags & XFSMNT_IKEEP)
+ mp->m_flags |= XFS_MOUNT_IKEEP;
+ if (ap->flags & XFSMNT_DIRSYNC)
+ mp->m_flags |= XFS_MOUNT_DIRSYNC;
+ if (ap->flags & XFSMNT_ATTR2)
+ mp->m_flags |= XFS_MOUNT_ATTR2;
+ if (ap->flags & XFSMNT_NOATTR2)
+ mp->m_flags |= XFS_MOUNT_NOATTR2;
+
+ if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE)
+ mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
+
+ /*
+ * no recovery flag requires a read-only mount
+ */
+ if (ap->flags & XFSMNT_NORECOVERY) {
+ if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ cmn_err(CE_WARN,
+ "XFS: tried to mount a FS read-write without recovery!");
+ return XFS_ERROR(EINVAL);
+ }
+ mp->m_flags |= XFS_MOUNT_NORECOVERY;
+ }
+
+ if (ap->flags & XFSMNT_NOUUID)
+ mp->m_flags |= XFS_MOUNT_NOUUID;
+ if (ap->flags & XFSMNT_BARRIER)
+ mp->m_flags |= XFS_MOUNT_BARRIER;
+ else
+ mp->m_flags &= ~XFS_MOUNT_BARRIER;
+
+ if (ap->flags2 & XFSMNT2_FILESTREAMS)
+ mp->m_flags |= XFS_MOUNT_FILESTREAMS;
+
+ if (ap->flags & XFSMNT_DMAPI)
+ mp->m_flags |= XFS_MOUNT_DMAPI;
+ return 0;
+
+
+ out_free_rtname:
+ kfree(mp->m_rtname);
+ out_free_fsname:
+ kfree(mp->m_fsname);
+ out:
+ return error;
+}
+
+/*
+ * This function fills in xfs_mount_t fields based on mount args.
+ * Note: the superblock _has_ now been read in.
+ */
+STATIC int
+xfs_finish_flags(
+ struct xfs_mount_args *ap,
+ struct xfs_mount *mp)
+{
+ int ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
+
+ /* Fail a mount where the logbuf is smaller then the log stripe */
+ if (xfs_sb_version_haslogv2(&mp->m_sb)) {
+ if ((ap->logbufsize <= 0) &&
+ (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) {
+ mp->m_logbsize = mp->m_sb.sb_logsunit;
+ } else if (ap->logbufsize > 0 &&
+ ap->logbufsize < mp->m_sb.sb_logsunit) {
+ cmn_err(CE_WARN,
+ "XFS: logbuf size must be greater than or equal to log stripe size");
+ return XFS_ERROR(EINVAL);
+ }
+ } else {
+ /* Fail a mount if the logbuf is larger than 32K */
+ if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) {
+ cmn_err(CE_WARN,
+ "XFS: logbuf size for version 1 logs must be 16K or 32K");
+ return XFS_ERROR(EINVAL);
+ }
+ }
+
+ /*
+ * mkfs'ed attr2 will turn on attr2 mount unless explicitly
+ * told by noattr2 to turn it off
+ */
+ if (xfs_sb_version_hasattr2(&mp->m_sb) &&
+ !(ap->flags & XFSMNT_NOATTR2))
+ mp->m_flags |= XFS_MOUNT_ATTR2;
+
+ /*
+ * prohibit r/w mounts of read-only filesystems
+ */
+ if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
+ cmn_err(CE_WARN,
+ "XFS: cannot mount a read-only filesystem as read-write");
+ return XFS_ERROR(EROFS);
+ }
+
+ /*
+ * check for shared mount.
+ */
+ if (ap->flags & XFSMNT_SHARED) {
+ if (!xfs_sb_version_hasshared(&mp->m_sb))
+ return XFS_ERROR(EINVAL);
+
+ /*
+ * For IRIX 6.5, shared mounts must have the shared
+ * version bit set, have the persistent readonly
+ * field set, must be version 0 and can only be mounted
+ * read-only.
+ */
+ if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) ||
+ (mp->m_sb.sb_shared_vn != 0))
+ return XFS_ERROR(EINVAL);
+
+ mp->m_flags |= XFS_MOUNT_SHARED;
+
+ /*
+ * Shared XFS V0 can't deal with DMI. Return EINVAL.
+ */
+ if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI))
+ return XFS_ERROR(EINVAL);
+ }
+
+ if (ap->flags & XFSMNT_UQUOTA) {
+ mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
+ if (ap->flags & XFSMNT_UQUOTAENF)
+ mp->m_qflags |= XFS_UQUOTA_ENFD;
+ }
+
+ if (ap->flags & XFSMNT_GQUOTA) {
+ mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
+ if (ap->flags & XFSMNT_GQUOTAENF)
+ mp->m_qflags |= XFS_OQUOTA_ENFD;
+ } else if (ap->flags & XFSMNT_PQUOTA) {
+ mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
+ if (ap->flags & XFSMNT_PQUOTAENF)
+ mp->m_qflags |= XFS_OQUOTA_ENFD;
+ }
+
+ return 0;
+}
+
STATIC int
xfs_fs_fill_super(
struct super_block *sb,
@@ -1308,11 +1667,21 @@ xfs_fs_fill_super(
{
struct inode *root;
struct xfs_mount *mp = NULL;
- struct xfs_mount_args *args = xfs_args_allocate(sb, silent);
- int error;
+ struct xfs_mount_args *args;
+ int flags = 0, error = ENOMEM;
- mp = xfs_mount_init();
+ args = xfs_args_allocate(sb, silent);
+ if (!args)
+ return -ENOMEM;
+ mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
+ if (!mp)
+ goto out_free_args;
+
+ spin_lock_init(&mp->m_sb_lock);
+ mutex_init(&mp->m_ilock);
+ mutex_init(&mp->m_growlock);
+ atomic_set(&mp->m_active_trans, 0);
INIT_LIST_HEAD(&mp->m_sync_list);
spin_lock_init(&mp->m_sync_lock);
init_waitqueue_head(&mp->m_wait_single_sync_task);
@@ -1325,16 +1694,60 @@ xfs_fs_fill_super(
error = xfs_parseargs(mp, (char *)data, args, 0);
if (error)
- goto fail_vfsop;
+ goto out_free_mp;
sb_min_blocksize(sb, BBSIZE);
+ sb->s_xattr = xfs_xattr_handlers;
sb->s_export_op = &xfs_export_operations;
sb->s_qcop = &xfs_quotactl_operations;
sb->s_op = &xfs_super_operations;
- error = xfs_mount(mp, args, NULL);
+ error = xfs_dmops_get(mp, args);
+ if (error)
+ goto out_free_mp;
+ error = xfs_qmops_get(mp, args);
+ if (error)
+ goto out_put_dmops;
+
+ if (args->flags & XFSMNT_QUIET)
+ flags |= XFS_MFSI_QUIET;
+
+ error = xfs_open_devices(mp, args);
+ if (error)
+ goto out_put_qmops;
+
+ if (xfs_icsb_init_counters(mp))
+ mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
+
+ /*
+ * Setup flags based on mount(2) options and then the superblock
+ */
+ error = xfs_start_flags(args, mp);
+ if (error)
+ goto out_free_fsname;
+ error = xfs_readsb(mp, flags);
+ if (error)
+ goto out_free_fsname;
+ error = xfs_finish_flags(args, mp);
if (error)
- goto fail_vfsop;
+ goto out_free_sb;
+
+ error = xfs_setup_devices(mp);
+ if (error)
+ goto out_free_sb;
+
+ if (mp->m_flags & XFS_MOUNT_BARRIER)
+ xfs_mountfs_check_barriers(mp);
+
+ error = xfs_filestream_mount(mp);
+ if (error)
+ goto out_free_sb;
+
+ error = xfs_mountfs(mp);
+ if (error)
+ goto out_filestream_unmount;
+
+ XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname);
sb->s_dirt = 1;
sb->s_magic = XFS_SB_MAGIC;
@@ -1344,7 +1757,7 @@ xfs_fs_fill_super(
sb->s_time_gran = 1;
set_posix_acl_flag(sb);
- root = igrab(mp->m_rootip->i_vnode);
+ root = igrab(VFS_I(mp->m_rootip));
if (!root) {
error = ENOENT;
goto fail_unmount;
@@ -1369,10 +1782,28 @@ xfs_fs_fill_super(
xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
- kmem_free(args, sizeof(*args));
+ kfree(args);
return 0;
-fail_vnrele:
+ out_filestream_unmount:
+ xfs_filestream_unmount(mp);
+ out_free_sb:
+ xfs_freesb(mp);
+ out_free_fsname:
+ xfs_free_fsname(mp);
+ xfs_icsb_destroy_counters(mp);
+ xfs_close_devices(mp);
+ out_put_qmops:
+ xfs_qmops_put(mp);
+ out_put_dmops:
+ xfs_dmops_put(mp);
+ out_free_mp:
+ kfree(mp);
+ out_free_args:
+ kfree(args);
+ return -error;
+
+ fail_vnrele:
if (sb->s_root) {
dput(sb->s_root);
sb->s_root = NULL;
@@ -1380,12 +1811,20 @@ fail_vnrele:
iput(root);
}
-fail_unmount:
- xfs_unmount(mp, 0, NULL);
+ fail_unmount:
+ /*
+ * Blow away any referenced inode in the filestreams cache.
+ * This can and will cause log traffic as inodes go inactive
+ * here.
+ */
+ xfs_filestream_unmount(mp);
-fail_vfsop:
- kmem_free(args, sizeof(*args));
- return -error;
+ XFS_bflush(mp->m_ddev_targp);
+ error = xfs_unmount_flush(mp, 0);
+ WARN_ON(error);
+
+ xfs_unmountfs(mp);
+ goto out_free_sb;
}
STATIC int
@@ -1430,9 +1869,235 @@ static struct file_system_type xfs_fs_type = {
.fs_flags = FS_REQUIRES_DEV,
};
+STATIC int __init
+xfs_alloc_trace_bufs(void)
+{
+#ifdef XFS_ALLOC_TRACE
+ xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_MAYFAIL);
+ if (!xfs_alloc_trace_buf)
+ goto out;
+#endif
+#ifdef XFS_BMAP_TRACE
+ xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_MAYFAIL);
+ if (!xfs_bmap_trace_buf)
+ goto out_free_alloc_trace;
+#endif
+#ifdef XFS_BMBT_TRACE
+ xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL);
+ if (!xfs_bmbt_trace_buf)
+ goto out_free_bmap_trace;
+#endif
+#ifdef XFS_ATTR_TRACE
+ xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL);
+ if (!xfs_attr_trace_buf)
+ goto out_free_bmbt_trace;
+#endif
+#ifdef XFS_DIR2_TRACE
+ xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_MAYFAIL);
+ if (!xfs_dir2_trace_buf)
+ goto out_free_attr_trace;
+#endif
+
+ return 0;
+
+#ifdef XFS_DIR2_TRACE
+ out_free_attr_trace:
+#endif
+#ifdef XFS_ATTR_TRACE
+ ktrace_free(xfs_attr_trace_buf);
+ out_free_bmbt_trace:
+#endif
+#ifdef XFS_BMBT_TRACE
+ ktrace_free(xfs_bmbt_trace_buf);
+ out_free_bmap_trace:
+#endif
+#ifdef XFS_BMAP_TRACE
+ ktrace_free(xfs_bmap_trace_buf);
+ out_free_alloc_trace:
+#endif
+#ifdef XFS_ALLOC_TRACE
+ ktrace_free(xfs_alloc_trace_buf);
+ out:
+#endif
+ return -ENOMEM;
+}
+
+STATIC void
+xfs_free_trace_bufs(void)
+{
+#ifdef XFS_DIR2_TRACE
+ ktrace_free(xfs_dir2_trace_buf);
+#endif
+#ifdef XFS_ATTR_TRACE
+ ktrace_free(xfs_attr_trace_buf);
+#endif
+#ifdef XFS_BMBT_TRACE
+ ktrace_free(xfs_bmbt_trace_buf);
+#endif
+#ifdef XFS_BMAP_TRACE
+ ktrace_free(xfs_bmap_trace_buf);
+#endif
+#ifdef XFS_ALLOC_TRACE
+ ktrace_free(xfs_alloc_trace_buf);
+#endif
+}
+
+STATIC int __init
+xfs_init_zones(void)
+{
+ xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode",
+ KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
+ KM_ZONE_SPREAD,
+ xfs_fs_inode_init_once);
+ if (!xfs_vnode_zone)
+ goto out;
+
+ xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
+ if (!xfs_ioend_zone)
+ goto out_destroy_vnode_zone;
+
+ xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
+ xfs_ioend_zone);
+ if (!xfs_ioend_pool)
+ goto out_destroy_ioend_zone;
+
+ xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
+ "xfs_log_ticket");
+ if (!xfs_log_ticket_zone)
+ goto out_destroy_ioend_pool;
+
+ xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
+ "xfs_bmap_free_item");
+ if (!xfs_bmap_free_item_zone)
+ goto out_destroy_log_ticket_zone;
+ xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
+ "xfs_btree_cur");
+ if (!xfs_btree_cur_zone)
+ goto out_destroy_bmap_free_item_zone;
+
+ xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
+ "xfs_da_state");
+ if (!xfs_da_state_zone)
+ goto out_destroy_btree_cur_zone;
+
+ xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
+ if (!xfs_dabuf_zone)
+ goto out_destroy_da_state_zone;
+
+ xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
+ if (!xfs_ifork_zone)
+ goto out_destroy_dabuf_zone;
+
+ xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
+ if (!xfs_trans_zone)
+ goto out_destroy_ifork_zone;
+
+ /*
+ * The size of the zone allocated buf log item is the maximum
+ * size possible under XFS. This wastes a little bit of memory,
+ * but it is much faster.
+ */
+ xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
+ (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
+ NBWORD) * sizeof(int))), "xfs_buf_item");
+ if (!xfs_buf_item_zone)
+ goto out_destroy_trans_zone;
+
+ xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
+ ((XFS_EFD_MAX_FAST_EXTENTS - 1) *
+ sizeof(xfs_extent_t))), "xfs_efd_item");
+ if (!xfs_efd_zone)
+ goto out_destroy_buf_item_zone;
+
+ xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
+ ((XFS_EFI_MAX_FAST_EXTENTS - 1) *
+ sizeof(xfs_extent_t))), "xfs_efi_item");
+ if (!xfs_efi_zone)
+ goto out_destroy_efd_zone;
+
+ xfs_inode_zone =
+ kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
+ KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
+ KM_ZONE_SPREAD, NULL);
+ if (!xfs_inode_zone)
+ goto out_destroy_efi_zone;
+
+ xfs_ili_zone =
+ kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
+ KM_ZONE_SPREAD, NULL);
+ if (!xfs_ili_zone)
+ goto out_destroy_inode_zone;
+
+#ifdef CONFIG_XFS_POSIX_ACL
+ xfs_acl_zone = kmem_zone_init(sizeof(xfs_acl_t), "xfs_acl");
+ if (!xfs_acl_zone)
+ goto out_destroy_ili_zone;
+#endif
+
+ return 0;
+
+#ifdef CONFIG_XFS_POSIX_ACL
+ out_destroy_ili_zone:
+#endif
+ kmem_zone_destroy(xfs_ili_zone);
+ out_destroy_inode_zone:
+ kmem_zone_destroy(xfs_inode_zone);
+ out_destroy_efi_zone:
+ kmem_zone_destroy(xfs_efi_zone);
+ out_destroy_efd_zone:
+ kmem_zone_destroy(xfs_efd_zone);
+ out_destroy_buf_item_zone:
+ kmem_zone_destroy(xfs_buf_item_zone);
+ out_destroy_trans_zone:
+ kmem_zone_destroy(xfs_trans_zone);
+ out_destroy_ifork_zone:
+ kmem_zone_destroy(xfs_ifork_zone);
+ out_destroy_dabuf_zone:
+ kmem_zone_destroy(xfs_dabuf_zone);
+ out_destroy_da_state_zone:
+ kmem_zone_destroy(xfs_da_state_zone);
+ out_destroy_btree_cur_zone:
+ kmem_zone_destroy(xfs_btree_cur_zone);
+ out_destroy_bmap_free_item_zone:
+ kmem_zone_destroy(xfs_bmap_free_item_zone);
+ out_destroy_log_ticket_zone:
+ kmem_zone_destroy(xfs_log_ticket_zone);
+ out_destroy_ioend_pool:
+ mempool_destroy(xfs_ioend_pool);
+ out_destroy_ioend_zone:
+ kmem_zone_destroy(xfs_ioend_zone);
+ out_destroy_vnode_zone:
+ kmem_zone_destroy(xfs_vnode_zone);
+ out:
+ return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_zones(void)
+{
+#ifdef CONFIG_XFS_POSIX_ACL
+ kmem_zone_destroy(xfs_acl_zone);
+#endif
+ kmem_zone_destroy(xfs_ili_zone);
+ kmem_zone_destroy(xfs_inode_zone);
+ kmem_zone_destroy(xfs_efi_zone);
+ kmem_zone_destroy(xfs_efd_zone);
+ kmem_zone_destroy(xfs_buf_item_zone);
+ kmem_zone_destroy(xfs_trans_zone);
+ kmem_zone_destroy(xfs_ifork_zone);
+ kmem_zone_destroy(xfs_dabuf_zone);
+ kmem_zone_destroy(xfs_da_state_zone);
+ kmem_zone_destroy(xfs_btree_cur_zone);
+ kmem_zone_destroy(xfs_bmap_free_item_zone);
+ kmem_zone_destroy(xfs_log_ticket_zone);
+ mempool_destroy(xfs_ioend_pool);
+ kmem_zone_destroy(xfs_ioend_zone);
+ kmem_zone_destroy(xfs_vnode_zone);
+
+}
STATIC int __init
-init_xfs_fs( void )
+init_xfs_fs(void)
{
int error;
static char message[] __initdata = KERN_INFO \
@@ -1441,42 +2106,73 @@ init_xfs_fs( void )
printk(message);
ktrace_init(64);
+ vn_init();
+ xfs_dir_startup();
error = xfs_init_zones();
- if (error < 0)
- goto undo_zones;
+ if (error)
+ goto out;
+
+ error = xfs_alloc_trace_bufs();
+ if (error)
+ goto out_destroy_zones;
+
+ error = xfs_mru_cache_init();
+ if (error)
+ goto out_free_trace_buffers;
+
+ error = xfs_filestream_init();
+ if (error)
+ goto out_mru_cache_uninit;
error = xfs_buf_init();
- if (error < 0)
- goto undo_buffers;
+ if (error)
+ goto out_filestream_uninit;
+
+ error = xfs_init_procfs();
+ if (error)
+ goto out_buf_terminate;
+
+ error = xfs_sysctl_register();
+ if (error)
+ goto out_cleanup_procfs;
- vn_init();
- xfs_init();
- uuid_init();
vfs_initquota();
error = register_filesystem(&xfs_fs_type);
if (error)
- goto undo_register;
+ goto out_sysctl_unregister;
return 0;
-undo_register:
+ out_sysctl_unregister:
+ xfs_sysctl_unregister();
+ out_cleanup_procfs:
+ xfs_cleanup_procfs();
+ out_buf_terminate:
xfs_buf_terminate();
-
-undo_buffers:
+ out_filestream_uninit:
+ xfs_filestream_uninit();
+ out_mru_cache_uninit:
+ xfs_mru_cache_uninit();
+ out_free_trace_buffers:
+ xfs_free_trace_bufs();
+ out_destroy_zones:
xfs_destroy_zones();
-
-undo_zones:
+ out:
return error;
}
STATIC void __exit
-exit_xfs_fs( void )
+exit_xfs_fs(void)
{
vfs_exitquota();
unregister_filesystem(&xfs_fs_type);
- xfs_cleanup();
+ xfs_sysctl_unregister();
+ xfs_cleanup_procfs();
xfs_buf_terminate();
+ xfs_filestream_uninit();
+ xfs_mru_cache_uninit();
+ xfs_free_trace_bufs();
xfs_destroy_zones();
ktrace_uninit();
}
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 3efb7c6d330..fe2ef4e6a0f 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -101,18 +101,13 @@ struct block_device;
extern __uint64_t xfs_max_file_offset(unsigned int);
-extern void xfs_initialize_vnode(struct xfs_mount *mp, bhv_vnode_t *vp,
- struct xfs_inode *ip);
-
extern void xfs_flush_inode(struct xfs_inode *);
extern void xfs_flush_device(struct xfs_inode *);
-extern int xfs_blkdev_get(struct xfs_mount *, const char *,
- struct block_device **);
-extern void xfs_blkdev_put(struct block_device *);
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern const struct export_operations xfs_export_operations;
+extern struct xattr_handler *xfs_xattr_handlers[];
#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index bb997d75c05..7dacb5bbde3 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -259,15 +259,17 @@ static ctl_table xfs_root_table[] = {
{}
};
-void
+int
xfs_sysctl_register(void)
{
xfs_table_header = register_sysctl_table(xfs_root_table);
+ if (!xfs_table_header)
+ return -ENOMEM;
+ return 0;
}
void
xfs_sysctl_unregister(void)
{
- if (xfs_table_header)
- unregister_sysctl_table(xfs_table_header);
+ unregister_sysctl_table(xfs_table_header);
}
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
index 98b97e399d6..4aadb8056c3 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -93,10 +93,10 @@ enum {
extern xfs_param_t xfs_params;
#ifdef CONFIG_SYSCTL
-extern void xfs_sysctl_register(void);
+extern int xfs_sysctl_register(void);
extern void xfs_sysctl_unregister(void);
#else
-# define xfs_sysctl_register() do { } while (0)
+# define xfs_sysctl_register() (0)
# define xfs_sysctl_unregister() do { } while (0)
#endif /* CONFIG_SYSCTL */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index bc7afe00733..b52528bbbff 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -33,7 +33,7 @@
/*
- * Dedicated vnode inactive/reclaim sync semaphores.
+ * Dedicated vnode inactive/reclaim sync wait queues.
* Prime number of hash buckets since address is used as the key.
*/
#define NVSYNC 37
@@ -82,74 +82,6 @@ vn_ioerror(
xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l);
}
-/*
- * Revalidate the Linux inode from the XFS inode.
- * Note: i_size _not_ updated; we must hold the inode
- * semaphore when doing that - callers responsibility.
- */
-int
-vn_revalidate(
- bhv_vnode_t *vp)
-{
- struct inode *inode = vn_to_inode(vp);
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- unsigned long xflags;
-
- xfs_itrace_entry(ip);
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- inode->i_mode = ip->i_d.di_mode;
- inode->i_uid = ip->i_d.di_uid;
- inode->i_gid = ip->i_d.di_gid;
- inode->i_mtime.tv_sec = ip->i_d.di_mtime.t_sec;
- inode->i_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec;
- inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
- inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
-
- xflags = xfs_ip2xflags(ip);
- if (xflags & XFS_XFLAG_IMMUTABLE)
- inode->i_flags |= S_IMMUTABLE;
- else
- inode->i_flags &= ~S_IMMUTABLE;
- if (xflags & XFS_XFLAG_APPEND)
- inode->i_flags |= S_APPEND;
- else
- inode->i_flags &= ~S_APPEND;
- if (xflags & XFS_XFLAG_SYNC)
- inode->i_flags |= S_SYNC;
- else
- inode->i_flags &= ~S_SYNC;
- if (xflags & XFS_XFLAG_NOATIME)
- inode->i_flags |= S_NOATIME;
- else
- inode->i_flags &= ~S_NOATIME;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- xfs_iflags_clear(ip, XFS_IMODIFIED);
- return 0;
-}
-
-/*
- * Add a reference to a referenced vnode.
- */
-bhv_vnode_t *
-vn_hold(
- bhv_vnode_t *vp)
-{
- struct inode *inode;
-
- XFS_STATS_INC(vn_hold);
-
- inode = igrab(vn_to_inode(vp));
- ASSERT(inode);
-
- return vp;
-}
-
#ifdef XFS_INODE_TRACE
/*
@@ -158,7 +90,7 @@ vn_hold(
*/
static inline int xfs_icount(struct xfs_inode *ip)
{
- bhv_vnode_t *vp = XFS_ITOV_NULL(ip);
+ struct inode *vp = VFS_I(ip);
if (vp)
return vn_count(vp);
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 25eb2a9e8d9..683ce16210f 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -19,24 +19,9 @@
#define __XFS_VNODE_H__
struct file;
-struct bhv_vattr;
struct xfs_iomap;
struct attrlist_cursor_kern;
-typedef struct inode bhv_vnode_t;
-
-/*
- * Vnode to Linux inode mapping.
- */
-static inline bhv_vnode_t *vn_from_inode(struct inode *inode)
-{
- return inode;
-}
-static inline struct inode *vn_to_inode(bhv_vnode_t *vnode)
-{
- return vnode;
-}
-
/*
* Return values for xfs_inactive. A return value of
* VN_INACTIVE_NOCACHE implies that the file system behavior
@@ -66,87 +51,8 @@ static inline struct inode *vn_to_inode(bhv_vnode_t *vnode)
Prevent VM access to the pages until
the operation completes. */
-/*
- * Vnode attributes. va_mask indicates those attributes the caller
- * wants to set or extract.
- */
-typedef struct bhv_vattr {
- int va_mask; /* bit-mask of attributes present */
- mode_t va_mode; /* file access mode and type */
- xfs_nlink_t va_nlink; /* number of references to file */
- uid_t va_uid; /* owner user id */
- gid_t va_gid; /* owner group id */
- xfs_ino_t va_nodeid; /* file id */
- xfs_off_t va_size; /* file size in bytes */
- u_long va_blocksize; /* blocksize preferred for i/o */
- struct timespec va_atime; /* time of last access */
- struct timespec va_mtime; /* time of last modification */
- struct timespec va_ctime; /* time file changed */
- u_int va_gen; /* generation number of file */
- xfs_dev_t va_rdev; /* device the special file represents */
- __int64_t va_nblocks; /* number of blocks allocated */
- u_long va_xflags; /* random extended file flags */
- u_long va_extsize; /* file extent size */
- u_long va_nextents; /* number of extents in file */
- u_long va_anextents; /* number of attr extents in file */
- prid_t va_projid; /* project id */
-} bhv_vattr_t;
-
-/*
- * setattr or getattr attributes
- */
-#define XFS_AT_TYPE 0x00000001
-#define XFS_AT_MODE 0x00000002
-#define XFS_AT_UID 0x00000004
-#define XFS_AT_GID 0x00000008
-#define XFS_AT_FSID 0x00000010
-#define XFS_AT_NODEID 0x00000020
-#define XFS_AT_NLINK 0x00000040
-#define XFS_AT_SIZE 0x00000080
-#define XFS_AT_ATIME 0x00000100
-#define XFS_AT_MTIME 0x00000200
-#define XFS_AT_CTIME 0x00000400
-#define XFS_AT_RDEV 0x00000800
-#define XFS_AT_BLKSIZE 0x00001000
-#define XFS_AT_NBLOCKS 0x00002000
-#define XFS_AT_VCODE 0x00004000
-#define XFS_AT_MAC 0x00008000
-#define XFS_AT_UPDATIME 0x00010000
-#define XFS_AT_UPDMTIME 0x00020000
-#define XFS_AT_UPDCTIME 0x00040000
-#define XFS_AT_ACL 0x00080000
-#define XFS_AT_CAP 0x00100000
-#define XFS_AT_INF 0x00200000
-#define XFS_AT_XFLAGS 0x00400000
-#define XFS_AT_EXTSIZE 0x00800000
-#define XFS_AT_NEXTENTS 0x01000000
-#define XFS_AT_ANEXTENTS 0x02000000
-#define XFS_AT_PROJID 0x04000000
-#define XFS_AT_SIZE_NOPERM 0x08000000
-#define XFS_AT_GENCOUNT 0x10000000
-
-#define XFS_AT_ALL (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\
- XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\
- XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\
- XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|XFS_AT_MAC|\
- XFS_AT_ACL|XFS_AT_CAP|XFS_AT_INF|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|\
- XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_PROJID|XFS_AT_GENCOUNT)
-
-#define XFS_AT_STAT (XFS_AT_TYPE|XFS_AT_MODE|XFS_AT_UID|XFS_AT_GID|\
- XFS_AT_FSID|XFS_AT_NODEID|XFS_AT_NLINK|XFS_AT_SIZE|\
- XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME|XFS_AT_RDEV|\
- XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_PROJID)
-
-#define XFS_AT_TIMES (XFS_AT_ATIME|XFS_AT_MTIME|XFS_AT_CTIME)
-
-#define XFS_AT_UPDTIMES (XFS_AT_UPDATIME|XFS_AT_UPDMTIME|XFS_AT_UPDCTIME)
-
-#define XFS_AT_NOSET (XFS_AT_NLINK|XFS_AT_RDEV|XFS_AT_FSID|XFS_AT_NODEID|\
- XFS_AT_TYPE|XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|\
- XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_GENCOUNT)
extern void vn_init(void);
-extern int vn_revalidate(bhv_vnode_t *);
/*
* Yeah, these don't take vnode anymore at all, all this should be
@@ -156,57 +62,52 @@ extern void vn_iowait(struct xfs_inode *ip);
extern void vn_iowake(struct xfs_inode *ip);
extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l);
-static inline int vn_count(bhv_vnode_t *vp)
+static inline int vn_count(struct inode *vp)
{
- return atomic_read(&vn_to_inode(vp)->i_count);
+ return atomic_read(&vp->i_count);
}
-/*
- * Vnode reference counting functions (and macros for compatibility).
- */
-extern bhv_vnode_t *vn_hold(bhv_vnode_t *);
+#define IHOLD(ip) \
+do { \
+ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
+ atomic_inc(&(VFS_I(ip)->i_count)); \
+ xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
+} while (0)
-#if defined(XFS_INODE_TRACE)
-#define VN_HOLD(vp) \
- ((void)vn_hold(vp), \
- xfs_itrace_hold(xfs_vtoi(vp), __FILE__, __LINE__, (inst_t *)__return_address))
-#define VN_RELE(vp) \
- (xfs_itrace_rele(xfs_vtoi(vp), __FILE__, __LINE__, (inst_t *)__return_address), \
- iput(vn_to_inode(vp)))
-#else
-#define VN_HOLD(vp) ((void)vn_hold(vp))
-#define VN_RELE(vp) (iput(vn_to_inode(vp)))
-#endif
+#define IRELE(ip) \
+do { \
+ xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
+ iput(VFS_I(ip)); \
+} while (0)
-static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp)
+static inline struct inode *vn_grab(struct inode *vp)
{
- struct inode *inode = igrab(vn_to_inode(vp));
- return inode ? vn_from_inode(inode) : NULL;
+ return igrab(vp);
}
/*
* Dealing with bad inodes
*/
-static inline int VN_BAD(bhv_vnode_t *vp)
+static inline int VN_BAD(struct inode *vp)
{
- return is_bad_inode(vn_to_inode(vp));
+ return is_bad_inode(vp);
}
/*
* Extracting atime values in various formats
*/
-static inline void vn_atime_to_bstime(bhv_vnode_t *vp, xfs_bstime_t *bs_atime)
+static inline void vn_atime_to_bstime(struct inode *vp, xfs_bstime_t *bs_atime)
{
bs_atime->tv_sec = vp->i_atime.tv_sec;
bs_atime->tv_nsec = vp->i_atime.tv_nsec;
}
-static inline void vn_atime_to_timespec(bhv_vnode_t *vp, struct timespec *ts)
+static inline void vn_atime_to_timespec(struct inode *vp, struct timespec *ts)
{
*ts = vp->i_atime;
}
-static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
+static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
{
*tt = vp->i_atime.tv_sec;
}
@@ -214,20 +115,11 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
/*
* Some useful predicates.
*/
-#define VN_MAPPED(vp) mapping_mapped(vn_to_inode(vp)->i_mapping)
-#define VN_CACHED(vp) (vn_to_inode(vp)->i_mapping->nrpages)
-#define VN_DIRTY(vp) mapping_tagged(vn_to_inode(vp)->i_mapping, \
+#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping)
+#define VN_CACHED(vp) (vp->i_mapping->nrpages)
+#define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \
PAGECACHE_TAG_DIRTY)
-/*
- * Flags to vop_setattr/getattr.
- */
-#define ATTR_UTIME 0x01 /* non-default utime(2) request */
-#define ATTR_DMI 0x08 /* invocation from a DMI function */
-#define ATTR_LAZY 0x80 /* set/get attributes lazily */
-#define ATTR_NONBLOCK 0x100 /* return EAGAIN if operation would block */
-#define ATTR_NOLOCK 0x200 /* Don't grab any conflicting locks */
-#define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */
/*
* Tracking vnode activity.
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
new file mode 100644
index 00000000000..964621fde6e
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -0,0 +1,330 @@
+/*
+ * Copyright (C) 2008 Christoph Hellwig.
+ * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "xfs.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_acl.h"
+#include "xfs_vnodeops.h"
+
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+
+/*
+ * ACL handling. Should eventually be moved into xfs_acl.c
+ */
+
+static int
+xfs_decode_acl(const char *name)
+{
+ if (strcmp(name, "posix_acl_access") == 0)
+ return _ACL_TYPE_ACCESS;
+ else if (strcmp(name, "posix_acl_default") == 0)
+ return _ACL_TYPE_DEFAULT;
+ return -EINVAL;
+}
+
+/*
+ * Get system extended attributes which at the moment only
+ * includes Posix ACLs.
+ */
+static int
+xfs_xattr_system_get(struct inode *inode, const char *name,
+ void *buffer, size_t size)
+{
+ int acl;
+
+ acl = xfs_decode_acl(name);
+ if (acl < 0)
+ return acl;
+
+ return xfs_acl_vget(inode, buffer, size, acl);
+}
+
+static int
+xfs_xattr_system_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ int acl;
+
+ acl = xfs_decode_acl(name);
+ if (acl < 0)
+ return acl;
+ if (flags & XATTR_CREATE)
+ return -EINVAL;
+
+ if (!value)
+ return xfs_acl_vremove(inode, acl);
+
+ return xfs_acl_vset(inode, (void *)value, size, acl);
+}
+
+static struct xattr_handler xfs_xattr_system_handler = {
+ .prefix = XATTR_SYSTEM_PREFIX,
+ .get = xfs_xattr_system_get,
+ .set = xfs_xattr_system_set,
+};
+
+
+/*
+ * Real xattr handling. The only difference between the namespaces is
+ * a flag passed to the low-level attr code.
+ */
+
+static int
+__xfs_xattr_get(struct inode *inode, const char *name,
+ void *value, size_t size, int xflags)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ int error, asize = size;
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ /* Convert Linux syscall to XFS internal ATTR flags */
+ if (!size) {
+ xflags |= ATTR_KERNOVAL;
+ value = NULL;
+ }
+
+ error = -xfs_attr_get(ip, name, value, &asize, xflags);
+ if (error)
+ return error;
+ return asize;
+}
+
+static int
+__xfs_xattr_set(struct inode *inode, const char *name, const void *value,
+ size_t size, int flags, int xflags)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+
+ /* Convert Linux syscall to XFS internal ATTR flags */
+ if (flags & XATTR_CREATE)
+ xflags |= ATTR_CREATE;
+ if (flags & XATTR_REPLACE)
+ xflags |= ATTR_REPLACE;
+
+ if (!value)
+ return -xfs_attr_remove(ip, name, xflags);
+ return -xfs_attr_set(ip, name, (void *)value, size, xflags);
+}
+
+static int
+xfs_xattr_user_get(struct inode *inode, const char *name,
+ void *value, size_t size)
+{
+ return __xfs_xattr_get(inode, name, value, size, 0);
+}
+
+static int
+xfs_xattr_user_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return __xfs_xattr_set(inode, name, value, size, flags, 0);
+}
+
+static struct xattr_handler xfs_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = xfs_xattr_user_get,
+ .set = xfs_xattr_user_set,
+};
+
+
+static int
+xfs_xattr_trusted_get(struct inode *inode, const char *name,
+ void *value, size_t size)
+{
+ return __xfs_xattr_get(inode, name, value, size, ATTR_ROOT);
+}
+
+static int
+xfs_xattr_trusted_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return __xfs_xattr_set(inode, name, value, size, flags, ATTR_ROOT);
+}
+
+static struct xattr_handler xfs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = xfs_xattr_trusted_get,
+ .set = xfs_xattr_trusted_set,
+};
+
+
+static int
+xfs_xattr_secure_get(struct inode *inode, const char *name,
+ void *value, size_t size)
+{
+ return __xfs_xattr_get(inode, name, value, size, ATTR_SECURE);
+}
+
+static int
+xfs_xattr_secure_set(struct inode *inode, const char *name,
+ const void *value, size_t size, int flags)
+{
+ return __xfs_xattr_set(inode, name, value, size, flags, ATTR_SECURE);
+}
+
+static struct xattr_handler xfs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = xfs_xattr_secure_get,
+ .set = xfs_xattr_secure_set,
+};
+
+
+struct xattr_handler *xfs_xattr_handlers[] = {
+ &xfs_xattr_user_handler,
+ &xfs_xattr_trusted_handler,
+ &xfs_xattr_security_handler,
+ &xfs_xattr_system_handler,
+ NULL
+};
+
+static unsigned int xfs_xattr_prefix_len(int flags)
+{
+ if (flags & XFS_ATTR_SECURE)
+ return sizeof("security");
+ else if (flags & XFS_ATTR_ROOT)
+ return sizeof("trusted");
+ else
+ return sizeof("user");
+}
+
+static const char *xfs_xattr_prefix(int flags)
+{
+ if (flags & XFS_ATTR_SECURE)
+ return xfs_xattr_security_handler.prefix;
+ else if (flags & XFS_ATTR_ROOT)
+ return xfs_xattr_trusted_handler.prefix;
+ else
+ return xfs_xattr_user_handler.prefix;
+}
+
+static int
+xfs_xattr_put_listent(struct xfs_attr_list_context *context, int flags,
+ char *name, int namelen, int valuelen, char *value)
+{
+ unsigned int prefix_len = xfs_xattr_prefix_len(flags);
+ char *offset;
+ int arraytop;
+
+ ASSERT(context->count >= 0);
+
+ /*
+ * Only show root namespace entries if we are actually allowed to
+ * see them.
+ */
+ if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN))
+ return 0;
+
+ arraytop = context->count + prefix_len + namelen + 1;
+ if (arraytop > context->firstu) {
+ context->count = -1; /* insufficient space */
+ return 1;
+ }
+ offset = (char *)context->alist + context->count;
+ strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
+ offset += prefix_len;
+ strncpy(offset, name, namelen); /* real name */
+ offset += namelen;
+ *offset = '\0';
+ context->count += prefix_len + namelen + 1;
+ return 0;
+}
+
+static int
+xfs_xattr_put_listent_sizes(struct xfs_attr_list_context *context, int flags,
+ char *name, int namelen, int valuelen, char *value)
+{
+ context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
+ return 0;
+}
+
+static int
+list_one_attr(const char *name, const size_t len, void *data,
+ size_t size, ssize_t *result)
+{
+ char *p = data + *result;
+
+ *result += len;
+ if (!size)
+ return 0;
+ if (*result > size)
+ return -ERANGE;
+
+ strcpy(p, name);
+ return 0;
+}
+
+ssize_t
+xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
+{
+ struct xfs_attr_list_context context;
+ struct attrlist_cursor_kern cursor = { 0 };
+ struct inode *inode = dentry->d_inode;
+ int error;
+
+ /*
+ * First read the regular on-disk attributes.
+ */
+ memset(&context, 0, sizeof(context));
+ context.dp = XFS_I(inode);
+ context.cursor = &cursor;
+ context.resynch = 1;
+ context.alist = data;
+ context.bufsize = size;
+ context.firstu = context.bufsize;
+
+ if (size)
+ context.put_listent = xfs_xattr_put_listent;
+ else
+ context.put_listent = xfs_xattr_put_listent_sizes;
+
+ xfs_attr_list_int(&context);
+ if (context.count < 0)
+ return -ERANGE;
+
+ /*
+ * Then add the two synthetic ACL attributes.
+ */
+ if (xfs_acl_vhasacl_access(inode)) {
+ error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
+ strlen(POSIX_ACL_XATTR_ACCESS) + 1,
+ data, size, &context.count);
+ if (error)
+ return error;
+ }
+
+ if (xfs_acl_vhasacl_default(inode)) {
+ error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
+ strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
+ data, size, &context.count);
+ if (error)
+ return error;
+ }
+
+ return context.count;
+}
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 85df3288efd..f2705f2fd43 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,11 +101,18 @@ xfs_qm_dqinit(
if (brandnewdquot) {
dqp->dq_flnext = dqp->dq_flprev = dqp;
mutex_init(&dqp->q_qlock);
- initnsema(&dqp->q_flock, 1, "fdq");
sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
+ /*
+ * Because we want to use a counting completion, complete
+ * the flush completion once to allow a single access to
+ * the flush completion without blocking.
+ */
+ init_completion(&dqp->q_flush);
+ complete(&dqp->q_flush);
+
#ifdef XFS_DQUOT_TRACE
- dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_SLEEP);
+ dqp->q_trace = ktrace_alloc(DQUOT_TRACE_SIZE, KM_NOFS);
xfs_dqtrace_entry(dqp, "DQINIT");
#endif
} else {
@@ -150,7 +157,6 @@ xfs_qm_dqdestroy(
ASSERT(! XFS_DQ_IS_ON_FREELIST(dqp));
mutex_destroy(&dqp->q_qlock);
- freesema(&dqp->q_flock);
sv_destroy(&dqp->q_pinwait);
#ifdef XFS_DQUOT_TRACE
@@ -431,7 +437,7 @@ xfs_qm_dqalloc(
* when it unlocks the inode. Since we want to keep the quota
* inode around, we bump the vnode ref count now.
*/
- VN_HOLD(XFS_ITOV(quotip));
+ IHOLD(quotip);
xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
nmaps = 1;
@@ -1211,7 +1217,7 @@ xfs_qm_dqflush(
int error;
ASSERT(XFS_DQ_IS_LOCKED(dqp));
- ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
+ ASSERT(!completion_done(&dqp->q_flush));
xfs_dqtrace_entry(dqp, "DQFLUSH");
/*
@@ -1348,34 +1354,18 @@ xfs_qm_dqflush_done(
xfs_dqfunlock(dqp);
}
-
-int
-xfs_qm_dqflock_nowait(
- xfs_dquot_t *dqp)
-{
- int locked;
-
- locked = cpsema(&((dqp)->q_flock));
-
- /* XXX ifdef these out */
- if (locked)
- (dqp)->dq_flags |= XFS_DQ_FLOCKED;
- return (locked);
-}
-
-
int
xfs_qm_dqlock_nowait(
xfs_dquot_t *dqp)
{
- return (mutex_trylock(&((dqp)->q_qlock)));
+ return mutex_trylock(&dqp->q_qlock);
}
void
xfs_dqlock(
xfs_dquot_t *dqp)
{
- mutex_lock(&(dqp->q_qlock));
+ mutex_lock(&dqp->q_qlock);
}
void
@@ -1435,8 +1425,7 @@ xfs_dqlock2(
/* ARGSUSED */
int
xfs_qm_dqpurge(
- xfs_dquot_t *dqp,
- uint flags)
+ xfs_dquot_t *dqp)
{
xfs_dqhash_t *thishash;
xfs_mount_t *mp = dqp->q_mount;
@@ -1469,7 +1458,7 @@ xfs_qm_dqpurge(
* if we're turning off quotas. Basically, we need this flush
* lock, and are willing to block on it.
*/
- if (! xfs_qm_dqflock_nowait(dqp)) {
+ if (!xfs_dqflock_nowait(dqp)) {
/*
* Block on the flush lock after nudging dquot buffer,
* if it is incore.
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 5c371a92e3e..8958d0faf8d 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -82,7 +82,7 @@ typedef struct xfs_dquot {
xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
mutex_t q_qlock; /* quota lock */
- sema_t q_flock; /* flush lock */
+ struct completion q_flush; /* flush completion queue */
uint q_pincount; /* pin count for this dquot */
sv_t q_pinwait; /* sync var for pinning */
#ifdef XFS_DQUOT_TRACE
@@ -113,17 +113,25 @@ XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
/*
- * The following three routines simply manage the q_flock
- * semaphore embedded in the dquot. This semaphore synchronizes
- * processes attempting to flush the in-core dquot back to disk.
+ * Manage the q_flush completion queue embedded in the dquot. This completion
+ * queue synchronizes processes attempting to flush the in-core dquot back to
+ * disk.
*/
-#define xfs_dqflock(dqp) { psema(&((dqp)->q_flock), PINOD | PRECALC);\
- (dqp)->dq_flags |= XFS_DQ_FLOCKED; }
-#define xfs_dqfunlock(dqp) { ASSERT(issemalocked(&((dqp)->q_flock))); \
- vsema(&((dqp)->q_flock)); \
- (dqp)->dq_flags &= ~(XFS_DQ_FLOCKED); }
+static inline void xfs_dqflock(xfs_dquot_t *dqp)
+{
+ wait_for_completion(&dqp->q_flush);
+}
+
+static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp)
+{
+ return try_wait_for_completion(&dqp->q_flush);
+}
+
+static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
+{
+ complete(&dqp->q_flush);
+}
-#define XFS_DQ_IS_FLUSH_LOCKED(dqp) (issemalocked(&((dqp)->q_flock)))
#define XFS_DQ_IS_ON_FREELIST(dqp) ((dqp)->dq_flnext != (dqp))
#define XFS_DQ_IS_DIRTY(dqp) ((dqp)->dq_flags & XFS_DQ_DIRTY)
#define XFS_QM_ISUDQ(dqp) ((dqp)->dq_flags & XFS_DQ_USER)
@@ -164,10 +172,9 @@ extern void xfs_qm_dqprint(xfs_dquot_t *);
extern void xfs_qm_dqdestroy(xfs_dquot_t *);
extern int xfs_qm_dqflush(xfs_dquot_t *, uint);
-extern int xfs_qm_dqpurge(xfs_dquot_t *, uint);
+extern int xfs_qm_dqpurge(xfs_dquot_t *);
extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
extern int xfs_qm_dqlock_nowait(xfs_dquot_t *);
-extern int xfs_qm_dqflock_nowait(xfs_dquot_t *);
extern void xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
xfs_disk_dquot_t *);
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 36e05ca7841..f028644caa5 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -151,7 +151,7 @@ xfs_qm_dquot_logitem_push(
dqp = logitem->qli_dquot;
ASSERT(XFS_DQ_IS_LOCKED(dqp));
- ASSERT(XFS_DQ_IS_FLUSH_LOCKED(dqp));
+ ASSERT(!completion_done(&dqp->q_flush));
/*
* Since we were able to lock the dquot's flush lock and
@@ -245,7 +245,7 @@ xfs_qm_dquot_logitem_pushbuf(
* inode flush completed and the inode was taken off the AIL.
* So, just get out.
*/
- if (!issemalocked(&(dqp->q_flock)) ||
+ if (completion_done(&dqp->q_flush) ||
((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
qip->qli_pushbuf_flag = 0;
xfs_dqunlock(dqp);
@@ -258,7 +258,7 @@ xfs_qm_dquot_logitem_pushbuf(
if (bp != NULL) {
if (XFS_BUF_ISDELAYWRITE(bp)) {
dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
- issemalocked(&(dqp->q_flock)));
+ !completion_done(&dqp->q_flush));
qip->qli_pushbuf_flag = 0;
xfs_dqunlock(dqp);
@@ -317,7 +317,7 @@ xfs_qm_dquot_logitem_trylock(
return (XFS_ITEM_LOCKED);
retval = XFS_ITEM_SUCCESS;
- if (! xfs_qm_dqflock_nowait(dqp)) {
+ if (!xfs_dqflock_nowait(dqp)) {
/*
* The dquot is already being flushed. It may have been
* flushed delayed write, however, and we don't want to
@@ -576,8 +576,8 @@ xfs_qm_qoffend_logitem_committed(
* xfs_trans_delete_ail() drops the AIL lock.
*/
xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs);
- kmem_free(qfs, sizeof(xfs_qoff_logitem_t));
- kmem_free(qfe, sizeof(xfs_qoff_logitem_t));
+ kmem_free(qfs);
+ kmem_free(qfe);
return (xfs_lsn_t)-1;
}
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index d31cce1165c..df0ffef9775 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -192,8 +192,8 @@ xfs_qm_destroy(
xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
}
- kmem_free(xqm->qm_usr_dqhtable, hsize * sizeof(xfs_dqhash_t));
- kmem_free(xqm->qm_grp_dqhtable, hsize * sizeof(xfs_dqhash_t));
+ kmem_free(xqm->qm_usr_dqhtable);
+ kmem_free(xqm->qm_grp_dqhtable);
xqm->qm_usr_dqhtable = NULL;
xqm->qm_grp_dqhtable = NULL;
xqm->qm_dqhashmask = 0;
@@ -201,7 +201,7 @@ xfs_qm_destroy(
#ifdef DEBUG
mutex_destroy(&qcheck_lock);
#endif
- kmem_free(xqm, sizeof(xfs_qm_t));
+ kmem_free(xqm);
}
/*
@@ -310,8 +310,7 @@ xfs_qm_unmount_quotadestroy(
*/
void
xfs_qm_mount_quotas(
- xfs_mount_t *mp,
- int mfsi_flags)
+ xfs_mount_t *mp)
{
int error = 0;
uint sbf;
@@ -346,8 +345,7 @@ xfs_qm_mount_quotas(
/*
* If any of the quotas are not consistent, do a quotacheck.
*/
- if (XFS_QM_NEED_QUOTACHECK(mp) &&
- !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
+ if (XFS_QM_NEED_QUOTACHECK(mp)) {
error = xfs_qm_quotacheck(mp);
if (error) {
/* Quotacheck failed and disabled quotas. */
@@ -445,11 +443,11 @@ xfs_qm_unmount_quotas(
}
}
if (uqp) {
- XFS_PURGE_INODE(uqp);
+ IRELE(uqp);
mp->m_quotainfo->qi_uquotaip = NULL;
}
if (gqp) {
- XFS_PURGE_INODE(gqp);
+ IRELE(gqp);
mp->m_quotainfo->qi_gquotaip = NULL;
}
out:
@@ -484,7 +482,7 @@ again:
xfs_dqtrace_entry(dqp, "FLUSHALL: DQDIRTY");
/* XXX a sentinel would be better */
recl = XFS_QI_MPLRECLAIMS(mp);
- if (! xfs_qm_dqflock_nowait(dqp)) {
+ if (!xfs_dqflock_nowait(dqp)) {
/*
* If we can't grab the flush lock then check
* to see if the dquot has been flushed delayed
@@ -631,7 +629,7 @@ xfs_qm_dqpurge_int(
* freelist in INACTIVE state.
*/
nextdqp = dqp->MPL_NEXT;
- nmisses += xfs_qm_dqpurge(dqp, flags);
+ nmisses += xfs_qm_dqpurge(dqp);
dqp = nextdqp;
}
xfs_qm_mplist_unlock(mp);
@@ -1062,7 +1060,7 @@ xfs_qm_sync(
/* XXX a sentinel would be better */
recl = XFS_QI_MPLRECLAIMS(mp);
- if (! xfs_qm_dqflock_nowait(dqp)) {
+ if (!xfs_dqflock_nowait(dqp)) {
if (nowait) {
xfs_dqunlock(dqp);
continue;
@@ -1134,7 +1132,7 @@ xfs_qm_init_quotainfo(
* and change the superblock accordingly.
*/
if ((error = xfs_qm_init_quotainos(mp))) {
- kmem_free(qinf, sizeof(xfs_quotainfo_t));
+ kmem_free(qinf);
mp->m_quotainfo = NULL;
return error;
}
@@ -1240,15 +1238,15 @@ xfs_qm_destroy_quotainfo(
xfs_qm_list_destroy(&qi->qi_dqlist);
if (qi->qi_uquotaip) {
- XFS_PURGE_INODE(qi->qi_uquotaip);
+ IRELE(qi->qi_uquotaip);
qi->qi_uquotaip = NULL; /* paranoia */
}
if (qi->qi_gquotaip) {
- XFS_PURGE_INODE(qi->qi_gquotaip);
+ IRELE(qi->qi_gquotaip);
qi->qi_gquotaip = NULL;
}
mutex_destroy(&qi->qi_quotaofflock);
- kmem_free(qi, sizeof(xfs_quotainfo_t));
+ kmem_free(qi);
mp->m_quotainfo = NULL;
}
@@ -1394,7 +1392,7 @@ xfs_qm_qino_alloc(
* locked exclusively and joined to the transaction already.
*/
ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL));
- VN_HOLD(XFS_ITOV((*ip)));
+ IHOLD(*ip);
/*
* Make the changes in the superblock, and log those too.
@@ -1623,7 +1621,7 @@ xfs_qm_dqiterate(
break;
} while (nmaps > 0);
- kmem_free(map, XFS_DQITER_MAP_SIZE * sizeof(*map));
+ kmem_free(map);
return error;
}
@@ -2079,7 +2077,7 @@ xfs_qm_shake_freelist(
* Try to grab the flush lock. If this dquot is in the process of
* getting flushed to disk, we don't want to reclaim it.
*/
- if (! xfs_qm_dqflock_nowait(dqp)) {
+ if (!xfs_dqflock_nowait(dqp)) {
xfs_dqunlock(dqp);
dqp = dqp->dq_flnext;
continue;
@@ -2257,7 +2255,7 @@ xfs_qm_dqreclaim_one(void)
* Try to grab the flush lock. If this dquot is in the process of
* getting flushed to disk, we don't want to reclaim it.
*/
- if (! xfs_qm_dqflock_nowait(dqp)) {
+ if (!xfs_dqflock_nowait(dqp)) {
xfs_dqunlock(dqp);
continue;
}
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index cd2300e374a..44f25349e47 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct {
#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--)
extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern void xfs_qm_mount_quotas(xfs_mount_t *, int);
+extern void xfs_qm_mount_quotas(xfs_mount_t *);
extern int xfs_qm_quotacheck(xfs_mount_t *);
extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
extern int xfs_qm_unmount_quotas(xfs_mount_t *);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index f4f6c4c861d..eea2e60b456 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -162,7 +162,7 @@ xfs_qm_newmount(
* mounting, and get on with the boring life
* without disk quotas.
*/
- xfs_qm_mount_quotas(mp, 0);
+ xfs_qm_mount_quotas(mp);
} else {
/*
* Clear the quota flags, but remember them. This
@@ -184,13 +184,12 @@ STATIC int
xfs_qm_endmount(
xfs_mount_t *mp,
uint needquotamount,
- uint quotaflags,
- int mfsi_flags)
+ uint quotaflags)
{
if (needquotamount) {
ASSERT(mp->m_qflags == 0);
mp->m_qflags = quotaflags;
- xfs_qm_mount_quotas(mp, mfsi_flags);
+ xfs_qm_mount_quotas(mp);
}
#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 768a3b27d2b..1a3b803dfa5 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -362,11 +362,11 @@ xfs_qm_scall_quotaoff(
* if we don't need them anymore.
*/
if ((dqtype & XFS_QMOPT_UQUOTA) && XFS_QI_UQIP(mp)) {
- XFS_PURGE_INODE(XFS_QI_UQIP(mp));
+ IRELE(XFS_QI_UQIP(mp));
XFS_QI_UQIP(mp) = NULL;
}
if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && XFS_QI_GQIP(mp)) {
- XFS_PURGE_INODE(XFS_QI_GQIP(mp));
+ IRELE(XFS_QI_GQIP(mp));
XFS_QI_GQIP(mp) = NULL;
}
out_error:
@@ -1034,7 +1034,7 @@ xfs_qm_dqrele_all_inodes(
{
xfs_inode_t *ip, *topino;
uint ireclaims;
- bhv_vnode_t *vp;
+ struct inode *vp;
boolean_t vnode_refd;
ASSERT(mp->m_quotainfo);
@@ -1059,7 +1059,7 @@ again:
ip = ip->i_mnext;
continue;
}
- vp = XFS_ITOV_NULL(ip);
+ vp = VFS_I(ip);
if (!vp) {
ASSERT(ip->i_udquot == NULL);
ASSERT(ip->i_gdquot == NULL);
@@ -1449,14 +1449,14 @@ xfs_qm_internalqcheck(
for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
xfs_dqtest_cmp(d);
e = (xfs_dqtest_t *) d->HL_NEXT;
- kmem_free(d, sizeof(xfs_dqtest_t));
+ kmem_free(d);
d = e;
}
h1 = &qmtest_gdqtab[i];
for (d = (xfs_dqtest_t *) h1->qh_next; d != NULL; ) {
xfs_dqtest_cmp(d);
e = (xfs_dqtest_t *) d->HL_NEXT;
- kmem_free(d, sizeof(xfs_dqtest_t));
+ kmem_free(d);
d = e;
}
}
@@ -1467,8 +1467,8 @@ xfs_qm_internalqcheck(
} else {
cmn_err(CE_DEBUG, "******** quotacheck successful! ********");
}
- kmem_free(qmtest_udqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
- kmem_free(qmtest_gdqtab, qmtest_hashmask * sizeof(xfs_dqhash_t));
+ kmem_free(qmtest_udqtab);
+ kmem_free(qmtest_gdqtab);
mutex_unlock(&qcheck_lock);
return (qmtest_nfails);
}
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
index 5e4a40b1c56..c4fcea600bc 100644
--- a/fs/xfs/quota/xfs_quota_priv.h
+++ b/fs/xfs/quota/xfs_quota_priv.h
@@ -158,9 +158,6 @@ for ((dqp) = (qlist)->qh_next; (dqp) != (xfs_dquot_t *)(qlist); \
#define XFS_IS_SUSER_DQUOT(dqp) \
(!((dqp)->q_core.d_id))
-#define XFS_PURGE_INODE(ip) \
- IRELE(ip);
-
#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
(((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
(((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index 0b75d302508..a34ef05489b 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -89,7 +89,7 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
if (sleep & KM_SLEEP)
panic("ktrace_alloc: NULL memory on KM_SLEEP request!");
- kmem_free(ktp, sizeof(*ktp));
+ kmem_free(ktp);
return NULL;
}
@@ -126,7 +126,7 @@ ktrace_free(ktrace_t *ktp)
} else {
entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t));
- kmem_free(ktp->kt_entries, entries_size);
+ kmem_free(ktp->kt_entries);
}
kmem_zone_free(ktrace_hdr_zone, ktp);
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c
index 493a6ecf859..5830c040ea7 100644
--- a/fs/xfs/support/uuid.c
+++ b/fs/xfs/support/uuid.c
@@ -17,7 +17,7 @@
*/
#include <xfs.h>
-static mutex_t uuid_monitor;
+static DEFINE_MUTEX(uuid_monitor);
static int uuid_table_size;
static uuid_t *uuid_table;
@@ -132,9 +132,3 @@ uuid_table_remove(uuid_t *uuid)
ASSERT(i < uuid_table_size);
mutex_unlock(&uuid_monitor);
}
-
-void __init
-uuid_init(void)
-{
- mutex_init(&uuid_monitor);
-}
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h
index b6f5922199b..cff5b607d44 100644
--- a/fs/xfs/support/uuid.h
+++ b/fs/xfs/support/uuid.h
@@ -22,7 +22,6 @@ typedef struct {
unsigned char __u_bits[16];
} uuid_t;
-extern void uuid_init(void);
extern void uuid_create_nil(uuid_t *uuid);
extern int uuid_is_nil(uuid_t *uuid);
extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index ebee3a4f703..b2f639a1416 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -37,15 +37,15 @@
#include <linux/capability.h>
#include <linux/posix_acl_xattr.h>
-STATIC int xfs_acl_setmode(bhv_vnode_t *, xfs_acl_t *, int *);
+STATIC int xfs_acl_setmode(struct inode *, xfs_acl_t *, int *);
STATIC void xfs_acl_filter_mode(mode_t, xfs_acl_t *);
STATIC void xfs_acl_get_endian(xfs_acl_t *);
STATIC int xfs_acl_access(uid_t, gid_t, xfs_acl_t *, mode_t, cred_t *);
STATIC int xfs_acl_invalid(xfs_acl_t *);
STATIC void xfs_acl_sync_mode(mode_t, xfs_acl_t *);
-STATIC void xfs_acl_get_attr(bhv_vnode_t *, xfs_acl_t *, int, int, int *);
-STATIC void xfs_acl_set_attr(bhv_vnode_t *, xfs_acl_t *, int, int *);
-STATIC int xfs_acl_allow_set(bhv_vnode_t *, int);
+STATIC void xfs_acl_get_attr(struct inode *, xfs_acl_t *, int, int, int *);
+STATIC void xfs_acl_set_attr(struct inode *, xfs_acl_t *, int, int *);
+STATIC int xfs_acl_allow_set(struct inode *, int);
kmem_zone_t *xfs_acl_zone;
@@ -55,7 +55,7 @@ kmem_zone_t *xfs_acl_zone;
*/
int
xfs_acl_vhasacl_access(
- bhv_vnode_t *vp)
+ struct inode *vp)
{
int error;
@@ -68,7 +68,7 @@ xfs_acl_vhasacl_access(
*/
int
xfs_acl_vhasacl_default(
- bhv_vnode_t *vp)
+ struct inode *vp)
{
int error;
@@ -207,7 +207,7 @@ posix_acl_xfs_to_xattr(
int
xfs_acl_vget(
- bhv_vnode_t *vp,
+ struct inode *vp,
void *acl,
size_t size,
int kind)
@@ -217,7 +217,6 @@ xfs_acl_vget(
posix_acl_xattr_header *ext_acl = acl;
int flags = 0;
- VN_HOLD(vp);
if(size) {
if (!(_ACL_ALLOC(xfs_acl))) {
error = ENOMEM;
@@ -239,11 +238,10 @@ xfs_acl_vget(
goto out;
}
if (kind == _ACL_TYPE_ACCESS)
- xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, xfs_acl);
+ xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, xfs_acl);
error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size);
}
out:
- VN_RELE(vp);
if(xfs_acl)
_ACL_FREE(xfs_acl);
return -error;
@@ -251,28 +249,26 @@ out:
int
xfs_acl_vremove(
- bhv_vnode_t *vp,
+ struct inode *vp,
int kind)
{
int error;
- VN_HOLD(vp);
error = xfs_acl_allow_set(vp, kind);
if (!error) {
- error = xfs_attr_remove(xfs_vtoi(vp),
+ error = xfs_attr_remove(XFS_I(vp),
kind == _ACL_TYPE_DEFAULT?
SGI_ACL_DEFAULT: SGI_ACL_FILE,
ATTR_ROOT);
if (error == ENOATTR)
error = 0; /* 'scool */
}
- VN_RELE(vp);
return -error;
}
int
xfs_acl_vset(
- bhv_vnode_t *vp,
+ struct inode *vp,
void *acl,
size_t size,
int kind)
@@ -298,7 +294,6 @@ xfs_acl_vset(
return 0;
}
- VN_HOLD(vp);
error = xfs_acl_allow_set(vp, kind);
/* Incoming ACL exists, set file mode based on its value */
@@ -321,7 +316,6 @@ xfs_acl_vset(
}
out:
- VN_RELE(vp);
_ACL_FREE(xfs_acl);
return -error;
}
@@ -341,8 +335,7 @@ xfs_acl_iaccess(
/* If the file has no ACL return -1. */
rval = sizeof(xfs_acl_t);
- if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval,
- ATTR_ROOT | ATTR_KERNACCESS)) {
+ if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval, ATTR_ROOT)) {
_ACL_FREE(acl);
return -1;
}
@@ -364,7 +357,7 @@ xfs_acl_iaccess(
STATIC int
xfs_acl_allow_set(
- bhv_vnode_t *vp,
+ struct inode *vp,
int kind)
{
if (vp->i_flags & (S_IMMUTABLE|S_APPEND))
@@ -373,7 +366,7 @@ xfs_acl_allow_set(
return ENOTDIR;
if (vp->i_sb->s_flags & MS_RDONLY)
return EROFS;
- if (xfs_vtoi(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER))
+ if (XFS_I(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER))
return EPERM;
return 0;
}
@@ -567,7 +560,7 @@ xfs_acl_get_endian(
*/
STATIC void
xfs_acl_get_attr(
- bhv_vnode_t *vp,
+ struct inode *vp,
xfs_acl_t *aclp,
int kind,
int flags,
@@ -577,7 +570,7 @@ xfs_acl_get_attr(
ASSERT((flags & ATTR_KERNOVAL) ? (aclp == NULL) : 1);
flags |= ATTR_ROOT;
- *error = xfs_attr_get(xfs_vtoi(vp),
+ *error = xfs_attr_get(XFS_I(vp),
kind == _ACL_TYPE_ACCESS ?
SGI_ACL_FILE : SGI_ACL_DEFAULT,
(char *)aclp, &len, flags);
@@ -591,7 +584,7 @@ xfs_acl_get_attr(
*/
STATIC void
xfs_acl_set_attr(
- bhv_vnode_t *vp,
+ struct inode *vp,
xfs_acl_t *aclp,
int kind,
int *error)
@@ -616,7 +609,7 @@ xfs_acl_set_attr(
INT_SET(newace->ae_perm, ARCH_CONVERT, ace->ae_perm);
}
INT_SET(newacl->acl_cnt, ARCH_CONVERT, aclp->acl_cnt);
- *error = xfs_attr_set(xfs_vtoi(vp),
+ *error = xfs_attr_set(XFS_I(vp),
kind == _ACL_TYPE_ACCESS ?
SGI_ACL_FILE: SGI_ACL_DEFAULT,
(char *)newacl, len, ATTR_ROOT);
@@ -625,7 +618,7 @@ xfs_acl_set_attr(
int
xfs_acl_vtoacl(
- bhv_vnode_t *vp,
+ struct inode *vp,
xfs_acl_t *access_acl,
xfs_acl_t *default_acl)
{
@@ -640,7 +633,7 @@ xfs_acl_vtoacl(
if (error)
access_acl->acl_cnt = XFS_ACL_NOT_PRESENT;
else /* We have a good ACL and the file mode, synchronize. */
- xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, access_acl);
+ xfs_acl_sync_mode(XFS_I(vp)->i_d.di_mode, access_acl);
}
if (default_acl) {
@@ -657,7 +650,7 @@ xfs_acl_vtoacl(
*/
int
xfs_acl_inherit(
- bhv_vnode_t *vp,
+ struct inode *vp,
mode_t mode,
xfs_acl_t *pdaclp)
{
@@ -716,11 +709,11 @@ out_error:
*/
STATIC int
xfs_acl_setmode(
- bhv_vnode_t *vp,
+ struct inode *vp,
xfs_acl_t *acl,
int *basicperms)
{
- bhv_vattr_t va;
+ struct iattr iattr;
xfs_acl_entry_t *ap;
xfs_acl_entry_t *gap = NULL;
int i, nomask = 1;
@@ -734,25 +727,25 @@ xfs_acl_setmode(
* Copy the u::, g::, o::, and m:: bits from the ACL into the
* mode. The m:: bits take precedence over the g:: bits.
*/
- va.va_mask = XFS_AT_MODE;
- va.va_mode = xfs_vtoi(vp)->i_d.di_mode;
- va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
+ iattr.ia_valid = ATTR_MODE;
+ iattr.ia_mode = XFS_I(vp)->i_d.di_mode;
+ iattr.ia_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO);
ap = acl->acl_entry;
for (i = 0; i < acl->acl_cnt; ++i) {
switch (ap->ae_tag) {
case ACL_USER_OBJ:
- va.va_mode |= ap->ae_perm << 6;
+ iattr.ia_mode |= ap->ae_perm << 6;
break;
case ACL_GROUP_OBJ:
gap = ap;
break;
case ACL_MASK: /* more than just standard modes */
nomask = 0;
- va.va_mode |= ap->ae_perm << 3;
+ iattr.ia_mode |= ap->ae_perm << 3;
*basicperms = 0;
break;
case ACL_OTHER:
- va.va_mode |= ap->ae_perm;
+ iattr.ia_mode |= ap->ae_perm;
break;
default: /* more than just standard modes */
*basicperms = 0;
@@ -763,9 +756,9 @@ xfs_acl_setmode(
/* Set the group bits from ACL_GROUP_OBJ if there's no ACL_MASK */
if (gap && nomask)
- va.va_mode |= gap->ae_perm << 3;
+ iattr.ia_mode |= gap->ae_perm << 3;
- return xfs_setattr(xfs_vtoi(vp), &va, 0, sys_cred);
+ return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred);
}
/*
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 332a772461c..a4e293b93ef 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -46,6 +46,8 @@ typedef struct xfs_acl {
#define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1)
#define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1)
+#define _ACL_TYPE_ACCESS 1
+#define _ACL_TYPE_DEFAULT 2
#ifdef CONFIG_XFS_POSIX_ACL
@@ -57,17 +59,15 @@ extern struct kmem_zone *xfs_acl_zone;
(zone) = kmem_zone_init(sizeof(xfs_acl_t), (name))
#define xfs_acl_zone_destroy(zone) kmem_zone_destroy(zone)
-extern int xfs_acl_inherit(bhv_vnode_t *, mode_t mode, xfs_acl_t *);
+extern int xfs_acl_inherit(struct inode *, mode_t mode, xfs_acl_t *);
extern int xfs_acl_iaccess(struct xfs_inode *, mode_t, cred_t *);
-extern int xfs_acl_vtoacl(bhv_vnode_t *, xfs_acl_t *, xfs_acl_t *);
-extern int xfs_acl_vhasacl_access(bhv_vnode_t *);
-extern int xfs_acl_vhasacl_default(bhv_vnode_t *);
-extern int xfs_acl_vset(bhv_vnode_t *, void *, size_t, int);
-extern int xfs_acl_vget(bhv_vnode_t *, void *, size_t, int);
-extern int xfs_acl_vremove(bhv_vnode_t *, int);
+extern int xfs_acl_vtoacl(struct inode *, xfs_acl_t *, xfs_acl_t *);
+extern int xfs_acl_vhasacl_access(struct inode *);
+extern int xfs_acl_vhasacl_default(struct inode *);
+extern int xfs_acl_vset(struct inode *, void *, size_t, int);
+extern int xfs_acl_vget(struct inode *, void *, size_t, int);
+extern int xfs_acl_vremove(struct inode *, int);
-#define _ACL_TYPE_ACCESS 1
-#define _ACL_TYPE_DEFAULT 2
#define _ACL_PERM_INVALID(perm) ((perm) & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
#define _ACL_INHERIT(c,m,d) (xfs_acl_inherit(c,m,d))
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index f9472a2076d..0b3b5efe848 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -92,16 +92,6 @@
((__u8*)(pointer))[1] = (((value) ) & 0xff); \
}
-/* define generic INT_ macros */
-
-#define INT_GET(reference,arch) \
- (((arch) == ARCH_NOCONVERT) \
- ? \
- (reference) \
- : \
- INT_SWAP((reference),(reference)) \
- )
-
/* does not return a value */
#define INT_SET(reference,arch,valueref) \
(__builtin_constant_p(valueref) ? \
@@ -112,64 +102,6 @@
) \
)
-/* does not return a value */
-#define INT_MOD_EXPR(reference,arch,code) \
- (((arch) == ARCH_NOCONVERT) \
- ? \
- (void)((reference) code) \
- : \
- (void)( \
- (reference) = INT_GET((reference),arch) , \
- ((reference) code), \
- INT_SET(reference, arch, reference) \
- ) \
- )
-
-/* does not return a value */
-#define INT_MOD(reference,arch,delta) \
- (void)( \
- INT_MOD_EXPR(reference,arch,+=(delta)) \
- )
-
-/*
- * INT_COPY - copy a value between two locations with the
- * _same architecture_ but _potentially different sizes_
- *
- * if the types of the two parameters are equal or they are
- * in native architecture, a simple copy is done
- *
- * otherwise, architecture conversions are done
- *
- */
-
-/* does not return a value */
-#define INT_COPY(dst,src,arch) \
- ( \
- ((sizeof(dst) == sizeof(src)) || ((arch) == ARCH_NOCONVERT)) \
- ? \
- (void)((dst) = (src)) \
- : \
- INT_SET(dst, arch, INT_GET(src, arch)) \
- )
-
-/*
- * INT_XLATE - copy a value in either direction between two locations
- * with different architectures
- *
- * dir < 0 - copy from memory to buffer (native to arch)
- * dir > 0 - copy from buffer to memory (arch to native)
- */
-
-/* does not return a value */
-#define INT_XLATE(buf,mem,dir,arch) {\
- ASSERT(dir); \
- if (dir>0) { \
- (mem)=INT_GET(buf, arch); \
- } else { \
- INT_SET(buf, arch, mem); \
- } \
-}
-
/*
* In directories inode numbers are stored as unaligned arrays of unsigned
* 8bit integers on disk.
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index df151a85918..f7cdc28aff4 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -16,8 +16,6 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#include <linux/capability.h>
-
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
@@ -57,11 +55,6 @@
* Provide the external interfaces to manage attribute lists.
*/
-#define ATTR_SYSCOUNT 2
-static struct attrnames posix_acl_access;
-static struct attrnames posix_acl_default;
-static struct attrnames *attr_system_names[ATTR_SYSCOUNT];
-
/*========================================================================
* Function prototypes for the kernel.
*========================================================================*/
@@ -116,6 +109,17 @@ xfs_attr_name_to_xname(
return 0;
}
+STATIC int
+xfs_inode_hasattr(
+ struct xfs_inode *ip)
+{
+ if (!XFS_IFORK_Q(ip) ||
+ (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+ ip->i_d.di_anextents == 0))
+ return 0;
+ return 1;
+}
+
/*========================================================================
* Overall external interface routines.
*========================================================================*/
@@ -127,10 +131,8 @@ xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name,
xfs_da_args_t args;
int error;
- if ((XFS_IFORK_Q(ip) == 0) ||
- (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_anextents == 0))
- return(ENOATTR);
+ if (!xfs_inode_hasattr(ip))
+ return ENOATTR;
/*
* Fill in the arg structure for this request.
@@ -148,11 +150,7 @@ xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name,
/*
* Decide on what work routines to call based on the inode size.
*/
- if (XFS_IFORK_Q(ip) == 0 ||
- (ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- ip->i_d.di_anextents == 0)) {
- error = XFS_ERROR(ENOATTR);
- } else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
error = xfs_attr_shortform_getvalue(&args);
} else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) {
error = xfs_attr_leaf_get(&args);
@@ -196,6 +194,46 @@ xfs_attr_get(
return(error);
}
+/*
+ * Calculate how many blocks we need for the new attribute,
+ */
+int
+xfs_attr_calc_size(
+ struct xfs_inode *ip,
+ int namelen,
+ int valuelen,
+ int *local)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int size;
+ int nblks;
+
+ /*
+ * Determine space new attribute will use, and if it would be
+ * "local" or "remote" (note: local != inline).
+ */
+ size = xfs_attr_leaf_newentsize(namelen, valuelen,
+ mp->m_sb.sb_blocksize, local);
+
+ nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
+ if (*local) {
+ if (size > (mp->m_sb.sb_blocksize >> 1)) {
+ /* Double split possible */
+ nblks *= 2;
+ }
+ } else {
+ /*
+ * Out of line attribute, cannot double split, but
+ * make room for the attribute value itself.
+ */
+ uint dblocks = XFS_B_TO_FSB(mp, valuelen);
+ nblks += dblocks;
+ nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
+ }
+
+ return nblks;
+}
+
STATIC int
xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
char *value, int valuelen, int flags)
@@ -204,10 +242,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
xfs_fsblock_t firstblock;
xfs_bmap_free_t flist;
int error, err2, committed;
- int local, size;
- uint nblks;
xfs_mount_t *mp = dp->i_mount;
int rsvd = (flags & ATTR_ROOT) != 0;
+ int local;
/*
* Attach the dquots to the inode.
@@ -241,33 +278,10 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
args.firstblock = &firstblock;
args.flist = &flist;
args.whichfork = XFS_ATTR_FORK;
- args.addname = 1;
- args.oknoent = 1;
-
- /*
- * Determine space new attribute will use, and if it would be
- * "local" or "remote" (note: local != inline).
- */
- size = xfs_attr_leaf_newentsize(name->len, valuelen,
- mp->m_sb.sb_blocksize, &local);
-
- nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
- if (local) {
- if (size > (mp->m_sb.sb_blocksize >> 1)) {
- /* Double split possible */
- nblks <<= 1;
- }
- } else {
- uint dblocks = XFS_B_TO_FSB(mp, valuelen);
- /* Out of line attribute, cannot double split, but make
- * room for the attribute value itself.
- */
- nblks += dblocks;
- nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
- }
+ args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
/* Size is now blocks for attribute data */
- args.total = nblks;
+ args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local);
/*
* Start our first transaction of the day.
@@ -289,18 +303,17 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
if (rsvd)
args.trans->t_flags |= XFS_TRANS_RESERVE;
- if ((error = xfs_trans_reserve(args.trans, (uint) nblks,
- XFS_ATTRSET_LOG_RES(mp, nblks),
- 0, XFS_TRANS_PERM_LOG_RES,
- XFS_ATTRSET_LOG_COUNT))) {
+ if ((error = xfs_trans_reserve(args.trans, args.total,
+ XFS_ATTRSET_LOG_RES(mp, args.total), 0,
+ XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) {
xfs_trans_cancel(args.trans, 0);
return(error);
}
xfs_ilock(dp, XFS_ILOCK_EXCL);
- error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, nblks, 0,
- rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
- XFS_QMOPT_RES_REGBLKS);
+ error = XFS_TRANS_RESERVE_QUOTA_NBLKS(mp, args.trans, dp, args.total, 0,
+ rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
+ XFS_QMOPT_RES_REGBLKS);
if (error) {
xfs_iunlock(dp, XFS_ILOCK_EXCL);
xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
@@ -387,7 +400,9 @@ xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name,
* Commit the leaf transformation. We'll need another (linked)
* transaction to add the new attribute to the leaf.
*/
- if ((error = xfs_attr_rolltrans(&args.trans, dp)))
+
+ error = xfs_trans_roll(&args.trans, dp);
+ if (error)
goto out;
}
@@ -529,9 +544,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
/*
* Decide on what work routines to call based on the inode size.
*/
- if (XFS_IFORK_Q(dp) == 0 ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
+ if (!xfs_inode_hasattr(dp)) {
error = XFS_ERROR(ENOATTR);
goto out;
}
@@ -601,29 +614,33 @@ xfs_attr_remove(
return error;
xfs_ilock(dp, XFS_ILOCK_SHARED);
- if (XFS_IFORK_Q(dp) == 0 ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
+ if (!xfs_inode_hasattr(dp)) {
xfs_iunlock(dp, XFS_ILOCK_SHARED);
- return(XFS_ERROR(ENOATTR));
+ return XFS_ERROR(ENOATTR);
}
xfs_iunlock(dp, XFS_ILOCK_SHARED);
return xfs_attr_remove_int(dp, &xname, flags);
}
-STATIC int
+int
xfs_attr_list_int(xfs_attr_list_context_t *context)
{
int error;
xfs_inode_t *dp = context->dp;
+ XFS_STATS_INC(xs_attr_list);
+
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return EIO;
+
+ xfs_ilock(dp, XFS_ILOCK_SHARED);
+ xfs_attr_trace_l_c("syscall start", context);
+
/*
* Decide on what work routines to call based on the inode size.
*/
- if (XFS_IFORK_Q(dp) == 0 ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
+ if (!xfs_inode_hasattr(dp)) {
error = 0;
} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
error = xfs_attr_shortform_list(context);
@@ -632,6 +649,10 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
} else {
error = xfs_attr_node_list(context);
}
+
+ xfs_iunlock(dp, XFS_ILOCK_SHARED);
+ xfs_attr_trace_l_c("syscall end", context);
+
return error;
}
@@ -648,74 +669,50 @@ xfs_attr_list_int(xfs_attr_list_context_t *context)
*/
/*ARGSUSED*/
STATIC int
-xfs_attr_put_listent(xfs_attr_list_context_t *context, attrnames_t *namesp,
+xfs_attr_put_listent(xfs_attr_list_context_t *context, int flags,
char *name, int namelen,
int valuelen, char *value)
{
+ struct attrlist *alist = (struct attrlist *)context->alist;
attrlist_ent_t *aep;
int arraytop;
ASSERT(!(context->flags & ATTR_KERNOVAL));
ASSERT(context->count >= 0);
ASSERT(context->count < (ATTR_MAX_VALUELEN/8));
- ASSERT(context->firstu >= sizeof(*context->alist));
+ ASSERT(context->firstu >= sizeof(*alist));
ASSERT(context->firstu <= context->bufsize);
- arraytop = sizeof(*context->alist) +
- context->count * sizeof(context->alist->al_offset[0]);
+ /*
+ * Only list entries in the right namespace.
+ */
+ if (((context->flags & ATTR_SECURE) == 0) !=
+ ((flags & XFS_ATTR_SECURE) == 0))
+ return 0;
+ if (((context->flags & ATTR_ROOT) == 0) !=
+ ((flags & XFS_ATTR_ROOT) == 0))
+ return 0;
+
+ arraytop = sizeof(*alist) +
+ context->count * sizeof(alist->al_offset[0]);
context->firstu -= ATTR_ENTSIZE(namelen);
if (context->firstu < arraytop) {
xfs_attr_trace_l_c("buffer full", context);
- context->alist->al_more = 1;
+ alist->al_more = 1;
context->seen_enough = 1;
return 1;
}
- aep = (attrlist_ent_t *)&(((char *)context->alist)[ context->firstu ]);
+ aep = (attrlist_ent_t *)&context->alist[context->firstu];
aep->a_valuelen = valuelen;
memcpy(aep->a_name, name, namelen);
- aep->a_name[ namelen ] = 0;
- context->alist->al_offset[ context->count++ ] = context->firstu;
- context->alist->al_count = context->count;
+ aep->a_name[namelen] = 0;
+ alist->al_offset[context->count++] = context->firstu;
+ alist->al_count = context->count;
xfs_attr_trace_l_c("add", context);
return 0;
}
-STATIC int
-xfs_attr_kern_list(xfs_attr_list_context_t *context, attrnames_t *namesp,
- char *name, int namelen,
- int valuelen, char *value)
-{
- char *offset;
- int arraytop;
-
- ASSERT(context->count >= 0);
-
- arraytop = context->count + namesp->attr_namelen + namelen + 1;
- if (arraytop > context->firstu) {
- context->count = -1; /* insufficient space */
- return 1;
- }
- offset = (char *)context->alist + context->count;
- strncpy(offset, namesp->attr_name, namesp->attr_namelen);
- offset += namesp->attr_namelen;
- strncpy(offset, name, namelen); /* real name */
- offset += namelen;
- *offset = '\0';
- context->count += namesp->attr_namelen + namelen + 1;
- return 0;
-}
-
-/*ARGSUSED*/
-STATIC int
-xfs_attr_kern_list_sizes(xfs_attr_list_context_t *context, attrnames_t *namesp,
- char *name, int namelen,
- int valuelen, char *value)
-{
- context->count += namesp->attr_namelen + namelen + 1;
- return 0;
-}
-
/*
* Generate a list of extended attribute names and optionally
* also value lengths. Positive return value follows the XFS
@@ -732,10 +729,9 @@ xfs_attr_list(
attrlist_cursor_kern_t *cursor)
{
xfs_attr_list_context_t context;
+ struct attrlist *alist;
int error;
- XFS_STATS_INC(xs_attr_list);
-
/*
* Validate the cursor.
*/
@@ -756,52 +752,23 @@ xfs_attr_list(
/*
* Initialize the output buffer.
*/
+ memset(&context, 0, sizeof(context));
context.dp = dp;
context.cursor = cursor;
- context.count = 0;
- context.dupcnt = 0;
context.resynch = 1;
context.flags = flags;
- context.seen_enough = 0;
- context.alist = (attrlist_t *)buffer;
- context.put_value = 0;
-
- if (flags & ATTR_KERNAMELS) {
- context.bufsize = bufsize;
- context.firstu = context.bufsize;
- if (flags & ATTR_KERNOVAL)
- context.put_listent = xfs_attr_kern_list_sizes;
- else
- context.put_listent = xfs_attr_kern_list;
- } else {
- context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
- context.firstu = context.bufsize;
- context.alist->al_count = 0;
- context.alist->al_more = 0;
- context.alist->al_offset[0] = context.bufsize;
- context.put_listent = xfs_attr_put_listent;
- }
-
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return EIO;
+ context.alist = buffer;
+ context.bufsize = (bufsize & ~(sizeof(int)-1)); /* align */
+ context.firstu = context.bufsize;
+ context.put_listent = xfs_attr_put_listent;
- xfs_ilock(dp, XFS_ILOCK_SHARED);
- xfs_attr_trace_l_c("syscall start", &context);
+ alist = (struct attrlist *)context.alist;
+ alist->al_count = 0;
+ alist->al_more = 0;
+ alist->al_offset[0] = context.bufsize;
error = xfs_attr_list_int(&context);
-
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
- xfs_attr_trace_l_c("syscall end", &context);
-
- if (context.flags & (ATTR_KERNOVAL|ATTR_KERNAMELS)) {
- /* must return negated buffer size or the error */
- if (context.count < 0)
- error = XFS_ERROR(ERANGE);
- else
- error = -context.count;
- } else
- ASSERT(error >= 0);
-
+ ASSERT(error >= 0);
return error;
}
@@ -816,12 +783,10 @@ xfs_attr_inactive(xfs_inode_t *dp)
ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
xfs_ilock(dp, XFS_ILOCK_SHARED);
- if ((XFS_IFORK_Q(dp) == 0) ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
+ if (!xfs_inode_hasattr(dp) ||
+ dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
xfs_iunlock(dp, XFS_ILOCK_SHARED);
- return(0);
+ return 0;
}
xfs_iunlock(dp, XFS_ILOCK_SHARED);
@@ -854,10 +819,8 @@ xfs_attr_inactive(xfs_inode_t *dp)
/*
* Decide on what work routines to call based on the inode size.
*/
- if ((XFS_IFORK_Q(dp) == 0) ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
+ if (!xfs_inode_hasattr(dp) ||
+ dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
error = 0;
goto out;
}
@@ -974,7 +937,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
xfs_da_brelse(args->trans, bp);
return(retval);
}
- args->rename = 1; /* an atomic rename */
+ args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
args->blkno2 = args->blkno; /* set 2nd entry info*/
args->index2 = args->index;
args->rmtblkno2 = args->rmtblkno;
@@ -1019,7 +982,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* Commit the current trans (including the inode) and start
* a new one.
*/
- if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
return (error);
/*
@@ -1033,7 +997,8 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* Commit the transaction that added the attr name so that
* later routines can manage their own transactions.
*/
- if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
return (error);
/*
@@ -1054,7 +1019,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* so that one disappears and one appears atomically. Then we
* must remove the "old" attribute/value pair.
*/
- if (args->rename) {
+ if (args->op_flags & XFS_DA_OP_RENAME) {
/*
* In a separate transaction, set the incomplete flag on the
* "old" attr and clear the incomplete flag on the "new" attr.
@@ -1122,7 +1087,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
/*
* Commit the remove and start the next trans in series.
*/
- error = xfs_attr_rolltrans(&args->trans, dp);
+ error = xfs_trans_roll(&args->trans, dp);
} else if (args->rmtblkno > 0) {
/*
@@ -1307,7 +1272,7 @@ restart:
} else if (retval == EEXIST) {
if (args->flags & ATTR_CREATE)
goto out;
- args->rename = 1; /* atomic rename op */
+ args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
args->blkno2 = args->blkno; /* set 2nd entry info*/
args->index2 = args->index;
args->rmtblkno2 = args->rmtblkno;
@@ -1353,7 +1318,8 @@ restart:
* Commit the node conversion and start the next
* trans in the chain.
*/
- if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
goto out;
goto restart;
@@ -1404,7 +1370,8 @@ restart:
* Commit the leaf addition or btree split and start the next
* trans in the chain.
*/
- if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
goto out;
/*
@@ -1425,7 +1392,7 @@ restart:
* so that one disappears and one appears atomically. Then we
* must remove the "old" attribute/value pair.
*/
- if (args->rename) {
+ if (args->op_flags & XFS_DA_OP_RENAME) {
/*
* In a separate transaction, set the incomplete flag on the
* "old" attr and clear the incomplete flag on the "new" attr.
@@ -1504,7 +1471,8 @@ restart:
/*
* Commit and start the next trans in the chain.
*/
- if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
goto out;
} else if (args->rmtblkno > 0) {
@@ -1636,7 +1604,8 @@ xfs_attr_node_removename(xfs_da_args_t *args)
/*
* Commit the Btree join operation and start a new trans.
*/
- if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
goto out;
}
@@ -2137,7 +2106,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
/*
* Start the next trans in the chain.
*/
- if ((error = xfs_attr_rolltrans(&args->trans, dp)))
+ error = xfs_trans_roll(&args->trans, dp);
+ if (error)
return (error);
}
@@ -2287,7 +2257,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
/*
* Close out trans and start the next one in the chain.
*/
- if ((error = xfs_attr_rolltrans(&args->trans, args->dp)))
+ error = xfs_trans_roll(&args->trans, args->dp);
+ if (error)
return (error);
}
return(0);
@@ -2300,23 +2271,7 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
void
xfs_attr_trace_l_c(char *where, struct xfs_attr_list_context *context)
{
- xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where,
- (__psunsigned_t)context->dp,
- (__psunsigned_t)context->cursor->hashval,
- (__psunsigned_t)context->cursor->blkno,
- (__psunsigned_t)context->cursor->offset,
- (__psunsigned_t)context->alist,
- (__psunsigned_t)context->bufsize,
- (__psunsigned_t)context->count,
- (__psunsigned_t)context->firstu,
- (__psunsigned_t)
- ((context->count > 0) &&
- !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
- ? (ATTR_ENTRY(context->alist,
- context->count-1)->a_valuelen)
- : 0,
- (__psunsigned_t)context->dupcnt,
- (__psunsigned_t)context->flags,
+ xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_C, where, context,
(__psunsigned_t)NULL,
(__psunsigned_t)NULL,
(__psunsigned_t)NULL);
@@ -2329,23 +2284,7 @@ void
xfs_attr_trace_l_cn(char *where, struct xfs_attr_list_context *context,
struct xfs_da_intnode *node)
{
- xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where,
- (__psunsigned_t)context->dp,
- (__psunsigned_t)context->cursor->hashval,
- (__psunsigned_t)context->cursor->blkno,
- (__psunsigned_t)context->cursor->offset,
- (__psunsigned_t)context->alist,
- (__psunsigned_t)context->bufsize,
- (__psunsigned_t)context->count,
- (__psunsigned_t)context->firstu,
- (__psunsigned_t)
- ((context->count > 0) &&
- !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
- ? (ATTR_ENTRY(context->alist,
- context->count-1)->a_valuelen)
- : 0,
- (__psunsigned_t)context->dupcnt,
- (__psunsigned_t)context->flags,
+ xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CN, where, context,
(__psunsigned_t)be16_to_cpu(node->hdr.count),
(__psunsigned_t)be32_to_cpu(node->btree[0].hashval),
(__psunsigned_t)be32_to_cpu(node->btree[
@@ -2359,23 +2298,7 @@ void
xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
struct xfs_da_node_entry *btree)
{
- xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where,
- (__psunsigned_t)context->dp,
- (__psunsigned_t)context->cursor->hashval,
- (__psunsigned_t)context->cursor->blkno,
- (__psunsigned_t)context->cursor->offset,
- (__psunsigned_t)context->alist,
- (__psunsigned_t)context->bufsize,
- (__psunsigned_t)context->count,
- (__psunsigned_t)context->firstu,
- (__psunsigned_t)
- ((context->count > 0) &&
- !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
- ? (ATTR_ENTRY(context->alist,
- context->count-1)->a_valuelen)
- : 0,
- (__psunsigned_t)context->dupcnt,
- (__psunsigned_t)context->flags,
+ xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CB, where, context,
(__psunsigned_t)be32_to_cpu(btree->hashval),
(__psunsigned_t)be32_to_cpu(btree->before),
(__psunsigned_t)NULL);
@@ -2388,23 +2311,7 @@ void
xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
struct xfs_attr_leafblock *leaf)
{
- xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where,
- (__psunsigned_t)context->dp,
- (__psunsigned_t)context->cursor->hashval,
- (__psunsigned_t)context->cursor->blkno,
- (__psunsigned_t)context->cursor->offset,
- (__psunsigned_t)context->alist,
- (__psunsigned_t)context->bufsize,
- (__psunsigned_t)context->count,
- (__psunsigned_t)context->firstu,
- (__psunsigned_t)
- ((context->count > 0) &&
- !(context->flags & (ATTR_KERNAMELS|ATTR_KERNOVAL)))
- ? (ATTR_ENTRY(context->alist,
- context->count-1)->a_valuelen)
- : 0,
- (__psunsigned_t)context->dupcnt,
- (__psunsigned_t)context->flags,
+ xfs_attr_trace_enter(XFS_ATTR_KTRACE_L_CL, where, context,
(__psunsigned_t)be16_to_cpu(leaf->hdr.count),
(__psunsigned_t)be32_to_cpu(leaf->entries[0].hashval),
(__psunsigned_t)be32_to_cpu(leaf->entries[
@@ -2417,329 +2324,24 @@ xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
*/
void
xfs_attr_trace_enter(int type, char *where,
- __psunsigned_t a2, __psunsigned_t a3,
- __psunsigned_t a4, __psunsigned_t a5,
- __psunsigned_t a6, __psunsigned_t a7,
- __psunsigned_t a8, __psunsigned_t a9,
- __psunsigned_t a10, __psunsigned_t a11,
- __psunsigned_t a12, __psunsigned_t a13,
- __psunsigned_t a14, __psunsigned_t a15)
+ struct xfs_attr_list_context *context,
+ __psunsigned_t a13, __psunsigned_t a14,
+ __psunsigned_t a15)
{
ASSERT(xfs_attr_trace_buf);
ktrace_enter(xfs_attr_trace_buf, (void *)((__psunsigned_t)type),
- (void *)where,
- (void *)a2, (void *)a3, (void *)a4,
- (void *)a5, (void *)a6, (void *)a7,
- (void *)a8, (void *)a9, (void *)a10,
- (void *)a11, (void *)a12, (void *)a13,
- (void *)a14, (void *)a15);
+ (void *)((__psunsigned_t)where),
+ (void *)((__psunsigned_t)context->dp),
+ (void *)((__psunsigned_t)context->cursor->hashval),
+ (void *)((__psunsigned_t)context->cursor->blkno),
+ (void *)((__psunsigned_t)context->cursor->offset),
+ (void *)((__psunsigned_t)context->alist),
+ (void *)((__psunsigned_t)context->bufsize),
+ (void *)((__psunsigned_t)context->count),
+ (void *)((__psunsigned_t)context->firstu),
+ NULL,
+ (void *)((__psunsigned_t)context->dupcnt),
+ (void *)((__psunsigned_t)context->flags),
+ (void *)a13, (void *)a14, (void *)a15);
}
#endif /* XFS_ATTR_TRACE */
-
-
-/*========================================================================
- * System (pseudo) namespace attribute interface routines.
- *========================================================================*/
-
-STATIC int
-posix_acl_access_set(
- bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
- return xfs_acl_vset(vp, data, size, _ACL_TYPE_ACCESS);
-}
-
-STATIC int
-posix_acl_access_remove(
- bhv_vnode_t *vp, char *name, int xflags)
-{
- return xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
-}
-
-STATIC int
-posix_acl_access_get(
- bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
- return xfs_acl_vget(vp, data, size, _ACL_TYPE_ACCESS);
-}
-
-STATIC int
-posix_acl_access_exists(
- bhv_vnode_t *vp)
-{
- return xfs_acl_vhasacl_access(vp);
-}
-
-STATIC int
-posix_acl_default_set(
- bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
- return xfs_acl_vset(vp, data, size, _ACL_TYPE_DEFAULT);
-}
-
-STATIC int
-posix_acl_default_get(
- bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
- return xfs_acl_vget(vp, data, size, _ACL_TYPE_DEFAULT);
-}
-
-STATIC int
-posix_acl_default_remove(
- bhv_vnode_t *vp, char *name, int xflags)
-{
- return xfs_acl_vremove(vp, _ACL_TYPE_DEFAULT);
-}
-
-STATIC int
-posix_acl_default_exists(
- bhv_vnode_t *vp)
-{
- return xfs_acl_vhasacl_default(vp);
-}
-
-static struct attrnames posix_acl_access = {
- .attr_name = "posix_acl_access",
- .attr_namelen = sizeof("posix_acl_access") - 1,
- .attr_get = posix_acl_access_get,
- .attr_set = posix_acl_access_set,
- .attr_remove = posix_acl_access_remove,
- .attr_exists = posix_acl_access_exists,
-};
-
-static struct attrnames posix_acl_default = {
- .attr_name = "posix_acl_default",
- .attr_namelen = sizeof("posix_acl_default") - 1,
- .attr_get = posix_acl_default_get,
- .attr_set = posix_acl_default_set,
- .attr_remove = posix_acl_default_remove,
- .attr_exists = posix_acl_default_exists,
-};
-
-static struct attrnames *attr_system_names[] =
- { &posix_acl_access, &posix_acl_default };
-
-
-/*========================================================================
- * Namespace-prefix-style attribute name interface routines.
- *========================================================================*/
-
-STATIC int
-attr_generic_set(
- bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
- return -xfs_attr_set(xfs_vtoi(vp), name, data, size, xflags);
-}
-
-STATIC int
-attr_generic_get(
- bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
- int error, asize = size;
-
- error = xfs_attr_get(xfs_vtoi(vp), name, data, &asize, xflags);
- if (!error)
- return asize;
- return -error;
-}
-
-STATIC int
-attr_generic_remove(
- bhv_vnode_t *vp, char *name, int xflags)
-{
- return -xfs_attr_remove(xfs_vtoi(vp), name, xflags);
-}
-
-STATIC int
-attr_generic_listadd(
- attrnames_t *prefix,
- attrnames_t *namesp,
- void *data,
- size_t size,
- ssize_t *result)
-{
- char *p = data + *result;
-
- *result += prefix->attr_namelen;
- *result += namesp->attr_namelen + 1;
- if (!size)
- return 0;
- if (*result > size)
- return -ERANGE;
- strcpy(p, prefix->attr_name);
- p += prefix->attr_namelen;
- strcpy(p, namesp->attr_name);
- p += namesp->attr_namelen + 1;
- return 0;
-}
-
-STATIC int
-attr_system_list(
- bhv_vnode_t *vp,
- void *data,
- size_t size,
- ssize_t *result)
-{
- attrnames_t *namesp;
- int i, error = 0;
-
- for (i = 0; i < ATTR_SYSCOUNT; i++) {
- namesp = attr_system_names[i];
- if (!namesp->attr_exists || !namesp->attr_exists(vp))
- continue;
- error = attr_generic_listadd(&attr_system, namesp,
- data, size, result);
- if (error)
- break;
- }
- return error;
-}
-
-int
-attr_generic_list(
- bhv_vnode_t *vp, void *data, size_t size, int xflags, ssize_t *result)
-{
- attrlist_cursor_kern_t cursor = { 0 };
- int error;
-
- error = xfs_attr_list(xfs_vtoi(vp), data, size, xflags, &cursor);
- if (error > 0)
- return -error;
- *result = -error;
- return attr_system_list(vp, data, size, result);
-}
-
-attrnames_t *
-attr_lookup_namespace(
- char *name,
- struct attrnames **names,
- int nnames)
-{
- int i;
-
- for (i = 0; i < nnames; i++)
- if (!strncmp(name, names[i]->attr_name, names[i]->attr_namelen))
- return names[i];
- return NULL;
-}
-
-/*
- * Some checks to prevent people abusing EAs to get over quota:
- * - Don't allow modifying user EAs on devices/symlinks;
- * - Don't allow modifying user EAs if sticky bit set;
- */
-STATIC int
-attr_user_capable(
- bhv_vnode_t *vp,
- cred_t *cred)
-{
- struct inode *inode = vn_to_inode(vp);
-
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return -EPERM;
- if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
- (current_fsuid(cred) != inode->i_uid) && !capable(CAP_FOWNER))
- return -EPERM;
- return 0;
-}
-
-STATIC int
-attr_trusted_capable(
- bhv_vnode_t *vp,
- cred_t *cred)
-{
- struct inode *inode = vn_to_inode(vp);
-
- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- return -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- return 0;
-}
-
-STATIC int
-attr_system_set(
- bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
- attrnames_t *namesp;
- int error;
-
- if (xflags & ATTR_CREATE)
- return -EINVAL;
-
- namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
- if (!namesp)
- return -EOPNOTSUPP;
- error = namesp->attr_set(vp, name, data, size, xflags);
- if (!error)
- error = vn_revalidate(vp);
- return error;
-}
-
-STATIC int
-attr_system_get(
- bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
-{
- attrnames_t *namesp;
-
- namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
- if (!namesp)
- return -EOPNOTSUPP;
- return namesp->attr_get(vp, name, data, size, xflags);
-}
-
-STATIC int
-attr_system_remove(
- bhv_vnode_t *vp, char *name, int xflags)
-{
- attrnames_t *namesp;
-
- namesp = attr_lookup_namespace(name, attr_system_names, ATTR_SYSCOUNT);
- if (!namesp)
- return -EOPNOTSUPP;
- return namesp->attr_remove(vp, name, xflags);
-}
-
-struct attrnames attr_system = {
- .attr_name = "system.",
- .attr_namelen = sizeof("system.") - 1,
- .attr_flag = ATTR_SYSTEM,
- .attr_get = attr_system_get,
- .attr_set = attr_system_set,
- .attr_remove = attr_system_remove,
- .attr_capable = (attrcapable_t)fs_noerr,
-};
-
-struct attrnames attr_trusted = {
- .attr_name = "trusted.",
- .attr_namelen = sizeof("trusted.") - 1,
- .attr_flag = ATTR_ROOT,
- .attr_get = attr_generic_get,
- .attr_set = attr_generic_set,
- .attr_remove = attr_generic_remove,
- .attr_capable = attr_trusted_capable,
-};
-
-struct attrnames attr_secure = {
- .attr_name = "security.",
- .attr_namelen = sizeof("security.") - 1,
- .attr_flag = ATTR_SECURE,
- .attr_get = attr_generic_get,
- .attr_set = attr_generic_set,
- .attr_remove = attr_generic_remove,
- .attr_capable = (attrcapable_t)fs_noerr,
-};
-
-struct attrnames attr_user = {
- .attr_name = "user.",
- .attr_namelen = sizeof("user.") - 1,
- .attr_get = attr_generic_get,
- .attr_set = attr_generic_set,
- .attr_remove = attr_generic_remove,
- .attr_capable = attr_user_capable,
-};
-
-struct attrnames *attr_namespaces[] =
- { &attr_system, &attr_trusted, &attr_secure, &attr_user };
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h
index 6cfc9384fe3..fb3b2a68b9b 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/xfs_attr.h
@@ -18,9 +18,11 @@
#ifndef __XFS_ATTR_H__
#define __XFS_ATTR_H__
+struct xfs_inode;
+struct xfs_da_args;
+struct xfs_attr_list_context;
+
/*
- * xfs_attr.h
- *
* Large attribute lists are structured around Btrees where all the data
* elements are in the leaf nodes. Attribute names are hashed into an int,
* then that int is used as the index into the Btree. Since the hashval
@@ -35,35 +37,6 @@
* External interfaces
*========================================================================*/
-struct cred;
-struct xfs_attr_list_context;
-
-typedef int (*attrset_t)(bhv_vnode_t *, char *, void *, size_t, int);
-typedef int (*attrget_t)(bhv_vnode_t *, char *, void *, size_t, int);
-typedef int (*attrremove_t)(bhv_vnode_t *, char *, int);
-typedef int (*attrexists_t)(bhv_vnode_t *);
-typedef int (*attrcapable_t)(bhv_vnode_t *, struct cred *);
-
-typedef struct attrnames {
- char * attr_name;
- unsigned int attr_namelen;
- unsigned int attr_flag;
- attrget_t attr_get;
- attrset_t attr_set;
- attrremove_t attr_remove;
- attrexists_t attr_exists;
- attrcapable_t attr_capable;
-} attrnames_t;
-
-#define ATTR_NAMECOUNT 4
-extern struct attrnames attr_user;
-extern struct attrnames attr_secure;
-extern struct attrnames attr_system;
-extern struct attrnames attr_trusted;
-extern struct attrnames *attr_namespaces[ATTR_NAMECOUNT];
-
-extern attrnames_t *attr_lookup_namespace(char *, attrnames_t **, int);
-extern int attr_generic_list(bhv_vnode_t *, void *, size_t, int, ssize_t *);
#define ATTR_DONTFOLLOW 0x0001 /* -- unused, from IRIX -- */
#define ATTR_ROOT 0x0002 /* use attrs in root (trusted) namespace */
@@ -71,16 +44,9 @@ extern int attr_generic_list(bhv_vnode_t *, void *, size_t, int, ssize_t *);
#define ATTR_SECURE 0x0008 /* use attrs in security namespace */
#define ATTR_CREATE 0x0010 /* pure create: fail if attr already exists */
#define ATTR_REPLACE 0x0020 /* pure set: fail if attr does not exist */
-#define ATTR_SYSTEM 0x0100 /* use attrs in system (pseudo) namespace */
-#define ATTR_KERNACCESS 0x0400 /* [kernel] iaccess, inode held io-locked */
#define ATTR_KERNOTIME 0x1000 /* [kernel] don't update inode timestamps */
#define ATTR_KERNOVAL 0x2000 /* [kernel] get attr size only, not value */
-#define ATTR_KERNAMELS 0x4000 /* [kernel] list attr names (simple list) */
-
-#define ATTR_KERNORMALS 0x0800 /* [kernel] normal attr list: user+secure */
-#define ATTR_KERNROOTLS 0x8000 /* [kernel] include root in the attr list */
-#define ATTR_KERNFULLS (ATTR_KERNORMALS|ATTR_KERNROOTLS)
/*
* The maximum size (into the kernel or returned from the kernel) of an
@@ -119,22 +85,6 @@ typedef struct attrlist_ent { /* data from attr_list() */
&((char *)buffer)[ ((attrlist_t *)(buffer))->al_offset[index] ])
/*
- * Multi-attribute operation vector.
- */
-typedef struct attr_multiop {
- int am_opcode; /* operation to perform (ATTR_OP_GET, etc.) */
- int am_error; /* [out arg] result of this sub-op (an errno) */
- char *am_attrname; /* attribute name to work with */
- char *am_attrvalue; /* [in/out arg] attribute value (raw bytes) */
- int am_length; /* [in/out arg] length of value */
- int am_flags; /* bitwise OR of attr API flags defined above */
-} attr_multiop_t;
-
-#define ATTR_OP_GET 1 /* return the indicated attr's value */
-#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */
-#define ATTR_OP_REMOVE 3 /* remove the indicated attr */
-
-/*
* Kernel-internal version of the attrlist cursor.
*/
typedef struct attrlist_cursor_kern {
@@ -148,20 +98,41 @@ typedef struct attrlist_cursor_kern {
/*========================================================================
- * Function prototypes for the kernel.
+ * Structure used to pass context around among the routines.
*========================================================================*/
-struct xfs_inode;
-struct attrlist_cursor_kern;
-struct xfs_da_args;
+
+typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int,
+ char *, int, int, char *);
+
+typedef struct xfs_attr_list_context {
+ struct xfs_inode *dp; /* inode */
+ struct attrlist_cursor_kern *cursor; /* position in list */
+ char *alist; /* output buffer */
+ int seen_enough; /* T/F: seen enough of list? */
+ ssize_t count; /* num used entries */
+ int dupcnt; /* count dup hashvals seen */
+ int bufsize; /* total buffer size */
+ int firstu; /* first used byte in buffer */
+ int flags; /* from VOP call */
+ int resynch; /* T/F: resynch with cursor */
+ int put_value; /* T/F: need value for listent */
+ put_listent_func_t put_listent; /* list output fmt function */
+ int index; /* index into output buffer */
+} xfs_attr_list_context_t;
+
+
+/*========================================================================
+ * Function prototypes for the kernel.
+ *========================================================================*/
/*
* Overall external interface routines.
*/
+int xfs_attr_calc_size(struct xfs_inode *, int, int, int *);
int xfs_attr_inactive(struct xfs_inode *dp);
-
-int xfs_attr_shortform_getvalue(struct xfs_da_args *);
int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int);
int xfs_attr_rmtval_get(struct xfs_da_args *args);
+int xfs_attr_list_int(struct xfs_attr_list_context *);
#endif /* __XFS_ATTR_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 303d41e4217..79da6b2ea99 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -94,13 +94,6 @@ STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
* Namespace helper routines
*========================================================================*/
-STATIC_INLINE attrnames_t *
-xfs_attr_flags_namesp(int flags)
-{
- return ((flags & XFS_ATTR_SECURE) ? &attr_secure:
- ((flags & XFS_ATTR_ROOT) ? &attr_trusted : &attr_user));
-}
-
/*
* If namespace bits don't match return 0.
* If all match then return 1.
@@ -111,25 +104,6 @@ xfs_attr_namesp_match(int arg_flags, int ondisk_flags)
return XFS_ATTR_NSP_ONDISK(ondisk_flags) == XFS_ATTR_NSP_ARGS_TO_ONDISK(arg_flags);
}
-/*
- * If namespace bits don't match and we don't have an override for it
- * then return 0.
- * If all match or are overridable then return 1.
- */
-STATIC_INLINE int
-xfs_attr_namesp_match_overrides(int arg_flags, int ondisk_flags)
-{
- if (((arg_flags & ATTR_SECURE) == 0) !=
- ((ondisk_flags & XFS_ATTR_SECURE) == 0) &&
- !(arg_flags & ATTR_KERNORMALS))
- return 0;
- if (((arg_flags & ATTR_ROOT) == 0) !=
- ((ondisk_flags & XFS_ATTR_ROOT) == 0) &&
- !(arg_flags & ATTR_KERNROOTLS))
- return 0;
- return 1;
-}
-
/*========================================================================
* External routines when attribute fork size < XFS_LITINO(mp).
@@ -369,9 +343,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
* Fix up the start offset of the attribute fork
*/
totsize -= size;
- if (totsize == sizeof(xfs_attr_sf_hdr_t) && !args->addname &&
- (mp->m_flags & XFS_MOUNT_ATTR2) &&
- (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) {
+ if (totsize == sizeof(xfs_attr_sf_hdr_t) &&
+ !(args->op_flags & XFS_DA_OP_ADDNAME) &&
+ (mp->m_flags & XFS_MOUNT_ATTR2) &&
+ (dp->i_d.di_format != XFS_DINODE_FMT_BTREE)) {
/*
* Last attribute now removed, revert to original
* inode format making all literal area available
@@ -389,9 +364,10 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
ASSERT(dp->i_d.di_forkoff);
- ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || args->addname ||
- !(mp->m_flags & XFS_MOUNT_ATTR2) ||
- dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
+ ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
+ (args->op_flags & XFS_DA_OP_ADDNAME) ||
+ !(mp->m_flags & XFS_MOUNT_ATTR2) ||
+ dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
dp->i_afp->if_ext_max =
XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
dp->i_df.if_ext_max =
@@ -531,7 +507,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
nargs.total = args->total;
nargs.whichfork = XFS_ATTR_FORK;
nargs.trans = args->trans;
- nargs.oknoent = 1;
+ nargs.op_flags = XFS_DA_OP_OKNOENT;
sfe = &sf->list[0];
for (i = 0; i < sf->hdr.count; i++) {
@@ -555,7 +531,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
out:
if(bp)
xfs_da_buf_done(bp);
- kmem_free(tmpbuffer, size);
+ kmem_free(tmpbuffer);
return(error);
}
@@ -624,15 +600,8 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
(XFS_ISRESET_CURSOR(cursor) &&
(dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) {
for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
- attrnames_t *namesp;
-
- if (!xfs_attr_namesp_match_overrides(context->flags, sfe->flags)) {
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
- continue;
- }
- namesp = xfs_attr_flags_namesp(sfe->flags);
error = context->put_listent(context,
- namesp,
+ sfe->flags,
(char *)sfe->nameval,
(int)sfe->namelen,
(int)sfe->valuelen,
@@ -676,13 +645,10 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
XFS_ERRLEVEL_LOW,
context->dp->i_mount, sfe);
xfs_attr_trace_l_c("sf corrupted", context);
- kmem_free(sbuf, sbsize);
+ kmem_free(sbuf);
return XFS_ERROR(EFSCORRUPTED);
}
- if (!xfs_attr_namesp_match_overrides(context->flags, sfe->flags)) {
- sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
- continue;
- }
+
sbp->entno = i;
sbp->hash = xfs_da_hashname((char *)sfe->nameval, sfe->namelen);
sbp->name = (char *)sfe->nameval;
@@ -717,7 +683,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
}
}
if (i == nsbuf) {
- kmem_free(sbuf, sbsize);
+ kmem_free(sbuf);
xfs_attr_trace_l_c("blk end", context);
return(0);
}
@@ -726,16 +692,12 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
* Loop putting entries into the user buffer.
*/
for ( ; i < nsbuf; i++, sbp++) {
- attrnames_t *namesp;
-
- namesp = xfs_attr_flags_namesp(sbp->flags);
-
if (cursor->hashval != sbp->hash) {
cursor->hashval = sbp->hash;
cursor->offset = 0;
}
error = context->put_listent(context,
- namesp,
+ sbp->flags,
sbp->name,
sbp->namelen,
sbp->valuelen,
@@ -747,7 +709,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
cursor->offset++;
}
- kmem_free(sbuf, sbsize);
+ kmem_free(sbuf);
xfs_attr_trace_l_c("sf E-O-F", context);
return(0);
}
@@ -853,7 +815,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
nargs.total = args->total;
nargs.whichfork = XFS_ATTR_FORK;
nargs.trans = args->trans;
- nargs.oknoent = 1;
+ nargs.op_flags = XFS_DA_OP_OKNOENT;
entry = &leaf->entries[0];
for (i = 0; i < be16_to_cpu(leaf->hdr.count); entry++, i++) {
if (entry->flags & XFS_ATTR_INCOMPLETE)
@@ -873,7 +835,7 @@ xfs_attr_leaf_to_shortform(xfs_dabuf_t *bp, xfs_da_args_t *args, int forkoff)
error = 0;
out:
- kmem_free(tmpbuffer, XFS_LBSIZE(dp->i_mount));
+ kmem_free(tmpbuffer);
return(error);
}
@@ -1155,7 +1117,7 @@ xfs_attr_leaf_add_work(xfs_dabuf_t *bp, xfs_da_args_t *args, int mapindex)
entry->hashval = cpu_to_be32(args->hashval);
entry->flags = tmp ? XFS_ATTR_LOCAL : 0;
entry->flags |= XFS_ATTR_NSP_ARGS_TO_ONDISK(args->flags);
- if (args->rename) {
+ if (args->op_flags & XFS_DA_OP_RENAME) {
entry->flags |= XFS_ATTR_INCOMPLETE;
if ((args->blkno2 == args->blkno) &&
(args->index2 <= args->index)) {
@@ -1271,7 +1233,7 @@ xfs_attr_leaf_compact(xfs_trans_t *trans, xfs_dabuf_t *bp)
be16_to_cpu(hdr_s->count), mp);
xfs_da_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
- kmem_free(tmpbuffer, XFS_LBSIZE(mp));
+ kmem_free(tmpbuffer);
}
/*
@@ -1921,7 +1883,7 @@ xfs_attr_leaf_unbalance(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
be16_to_cpu(drop_hdr->count), mp);
}
memcpy((char *)save_leaf, (char *)tmp_leaf, state->blocksize);
- kmem_free(tmpbuffer, state->blocksize);
+ kmem_free(tmpbuffer);
}
xfs_da_log_buf(state->args->trans, save_blk->bp, 0,
@@ -2400,8 +2362,6 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
*/
retval = 0;
for ( ; (i < be16_to_cpu(leaf->hdr.count)); entry++, i++) {
- attrnames_t *namesp;
-
if (be32_to_cpu(entry->hashval) != cursor->hashval) {
cursor->hashval = be32_to_cpu(entry->hashval);
cursor->offset = 0;
@@ -2409,17 +2369,13 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
if (entry->flags & XFS_ATTR_INCOMPLETE)
continue; /* skip incomplete entries */
- if (!xfs_attr_namesp_match_overrides(context->flags, entry->flags))
- continue;
-
- namesp = xfs_attr_flags_namesp(entry->flags);
if (entry->flags & XFS_ATTR_LOCAL) {
xfs_attr_leaf_name_local_t *name_loc =
XFS_ATTR_LEAF_NAME_LOCAL(leaf, i);
retval = context->put_listent(context,
- namesp,
+ entry->flags,
(char *)name_loc->nameval,
(int)name_loc->namelen,
be16_to_cpu(name_loc->valuelen),
@@ -2446,16 +2402,15 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
if (retval)
return retval;
retval = context->put_listent(context,
- namesp,
+ entry->flags,
(char *)name_rmt->name,
(int)name_rmt->namelen,
valuelen,
(char*)args.value);
- kmem_free(args.value, valuelen);
- }
- else {
+ kmem_free(args.value);
+ } else {
retval = context->put_listent(context,
- namesp,
+ entry->flags,
(char *)name_rmt->name,
(int)name_rmt->namelen,
valuelen,
@@ -2543,9 +2498,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
/*
* Commit the flag value change and start the next trans in series.
*/
- error = xfs_attr_rolltrans(&args->trans, args->dp);
-
- return(error);
+ return xfs_trans_roll(&args->trans, args->dp);
}
/*
@@ -2592,9 +2545,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
/*
* Commit the flag value change and start the next trans in series.
*/
- error = xfs_attr_rolltrans(&args->trans, args->dp);
-
- return(error);
+ return xfs_trans_roll(&args->trans, args->dp);
}
/*
@@ -2710,7 +2661,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
/*
* Commit the flag value change and start the next trans in series.
*/
- error = xfs_attr_rolltrans(&args->trans, args->dp);
+ error = xfs_trans_roll(&args->trans, args->dp);
return(error);
}
@@ -2768,7 +2719,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
/*
* Commit the invalidate and start the next transaction.
*/
- error = xfs_attr_rolltrans(trans, dp);
+ error = xfs_trans_roll(trans, dp);
return (error);
}
@@ -2870,7 +2821,8 @@ xfs_attr_node_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp,
/*
* Atomically commit the whole invalidate stuff.
*/
- if ((error = xfs_attr_rolltrans(trans, dp)))
+ error = xfs_trans_roll(trans, dp);
+ if (error)
return (error);
}
@@ -2954,7 +2906,7 @@ xfs_attr_leaf_inactive(xfs_trans_t **trans, xfs_inode_t *dp, xfs_dabuf_t *bp)
error = tmp; /* save only the 1st errno */
}
- kmem_free((xfs_caddr_t)list, size);
+ kmem_free((xfs_caddr_t)list);
return(error);
}
@@ -3009,7 +2961,8 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
/*
* Roll to next transaction.
*/
- if ((error = xfs_attr_rolltrans(trans, dp)))
+ error = xfs_trans_roll(trans, dp);
+ if (error)
return (error);
}
@@ -3019,60 +2972,3 @@ xfs_attr_leaf_freextent(xfs_trans_t **trans, xfs_inode_t *dp,
return(0);
}
-
-
-/*
- * Roll from one trans in the sequence of PERMANENT transactions to the next.
- */
-int
-xfs_attr_rolltrans(xfs_trans_t **transp, xfs_inode_t *dp)
-{
- xfs_trans_t *trans;
- unsigned int logres, count;
- int error;
-
- /*
- * Ensure that the inode is always logged.
- */
- trans = *transp;
- xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
-
- /*
- * Copy the critical parameters from one trans to the next.
- */
- logres = trans->t_log_res;
- count = trans->t_log_count;
- *transp = xfs_trans_dup(trans);
-
- /*
- * Commit the current transaction.
- * If this commit failed, then it'd just unlock those items that
- * are not marked ihold. That also means that a filesystem shutdown
- * is in progress. The caller takes the responsibility to cancel
- * the duplicate transaction that gets returned.
- */
- if ((error = xfs_trans_commit(trans, 0)))
- return (error);
-
- trans = *transp;
-
- /*
- * Reserve space in the log for th next transaction.
- * This also pushes items in the "AIL", the list of logged items,
- * out to disk if they are taking up space at the tail of the log
- * that we want to use. This requires that either nothing be locked
- * across this call, or that anything that is locked be logged in
- * the prior and the next transactions.
- */
- error = xfs_trans_reserve(trans, 0, logres, 0,
- XFS_TRANS_PERM_LOG_RES, count);
- /*
- * Ensure that the inode is in the new transaction and locked.
- */
- if (!error) {
- xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
- xfs_trans_ihold(trans, dp);
- }
- return (error);
-
-}
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 040f732ce1e..83e9af417ca 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -30,7 +30,7 @@
struct attrlist;
struct attrlist_cursor_kern;
-struct attrnames;
+struct xfs_attr_list_context;
struct xfs_dabuf;
struct xfs_da_args;
struct xfs_da_state;
@@ -204,33 +204,6 @@ static inline int xfs_attr_leaf_entsize_local_max(int bsize)
return (((bsize) >> 1) + ((bsize) >> 2));
}
-
-/*========================================================================
- * Structure used to pass context around among the routines.
- *========================================================================*/
-
-
-struct xfs_attr_list_context;
-
-typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, struct attrnames *,
- char *, int, int, char *);
-
-typedef struct xfs_attr_list_context {
- struct xfs_inode *dp; /* inode */
- struct attrlist_cursor_kern *cursor; /* position in list */
- struct attrlist *alist; /* output buffer */
- int seen_enough; /* T/F: seen enough of list? */
- int count; /* num used entries */
- int dupcnt; /* count dup hashvals seen */
- int bufsize; /* total buffer size */
- int firstu; /* first used byte in buffer */
- int flags; /* from VOP call */
- int resynch; /* T/F: resynch with cursor */
- int put_value; /* T/F: need value for listent */
- put_listent_func_t put_listent; /* list output fmt function */
- int index; /* index into output buffer */
-} xfs_attr_list_context_t;
-
/*
* Used to keep a list of "remote value" extents when unlinking an inode.
*/
@@ -301,6 +274,4 @@ int xfs_attr_leaf_order(struct xfs_dabuf *leaf1_bp,
struct xfs_dabuf *leaf2_bp);
int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
int *local);
-int xfs_attr_rolltrans(struct xfs_trans **transp, struct xfs_inode *dp);
-
#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/xfs_attr_sf.h
index f67f917803b..ea22839caed 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/xfs_attr_sf.h
@@ -97,13 +97,9 @@ void xfs_attr_trace_l_cb(char *where, struct xfs_attr_list_context *context,
void xfs_attr_trace_l_cl(char *where, struct xfs_attr_list_context *context,
struct xfs_attr_leafblock *leaf);
void xfs_attr_trace_enter(int type, char *where,
- __psunsigned_t a2, __psunsigned_t a3,
- __psunsigned_t a4, __psunsigned_t a5,
- __psunsigned_t a6, __psunsigned_t a7,
- __psunsigned_t a8, __psunsigned_t a9,
- __psunsigned_t a10, __psunsigned_t a11,
- __psunsigned_t a12, __psunsigned_t a13,
- __psunsigned_t a14, __psunsigned_t a15);
+ struct xfs_attr_list_context *context,
+ __psunsigned_t a13, __psunsigned_t a14,
+ __psunsigned_t a15);
#else
#define xfs_attr_trace_l_c(w,c)
#define xfs_attr_trace_l_cn(w,c,n)
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
index fab0b6d5a41..48228848f5a 100644
--- a/fs/xfs/xfs_bit.c
+++ b/fs/xfs/xfs_bit.c
@@ -25,109 +25,6 @@
* XFS bit manipulation routines, used in non-realtime code.
*/
-#ifndef HAVE_ARCH_HIGHBIT
-/*
- * Index of high bit number in byte, -1 for none set, 0..7 otherwise.
- */
-static const char xfs_highbit[256] = {
- -1, 0, 1, 1, 2, 2, 2, 2, /* 00 .. 07 */
- 3, 3, 3, 3, 3, 3, 3, 3, /* 08 .. 0f */
- 4, 4, 4, 4, 4, 4, 4, 4, /* 10 .. 17 */
- 4, 4, 4, 4, 4, 4, 4, 4, /* 18 .. 1f */
- 5, 5, 5, 5, 5, 5, 5, 5, /* 20 .. 27 */
- 5, 5, 5, 5, 5, 5, 5, 5, /* 28 .. 2f */
- 5, 5, 5, 5, 5, 5, 5, 5, /* 30 .. 37 */
- 5, 5, 5, 5, 5, 5, 5, 5, /* 38 .. 3f */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 40 .. 47 */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 48 .. 4f */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 50 .. 57 */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 58 .. 5f */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 60 .. 67 */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 68 .. 6f */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 70 .. 77 */
- 6, 6, 6, 6, 6, 6, 6, 6, /* 78 .. 7f */
- 7, 7, 7, 7, 7, 7, 7, 7, /* 80 .. 87 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* 88 .. 8f */
- 7, 7, 7, 7, 7, 7, 7, 7, /* 90 .. 97 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* 98 .. 9f */
- 7, 7, 7, 7, 7, 7, 7, 7, /* a0 .. a7 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* a8 .. af */
- 7, 7, 7, 7, 7, 7, 7, 7, /* b0 .. b7 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* b8 .. bf */
- 7, 7, 7, 7, 7, 7, 7, 7, /* c0 .. c7 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* c8 .. cf */
- 7, 7, 7, 7, 7, 7, 7, 7, /* d0 .. d7 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* d8 .. df */
- 7, 7, 7, 7, 7, 7, 7, 7, /* e0 .. e7 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* e8 .. ef */
- 7, 7, 7, 7, 7, 7, 7, 7, /* f0 .. f7 */
- 7, 7, 7, 7, 7, 7, 7, 7, /* f8 .. ff */
-};
-#endif
-
-/*
- * xfs_highbit32: get high bit set out of 32-bit argument, -1 if none set.
- */
-inline int
-xfs_highbit32(
- __uint32_t v)
-{
-#ifdef HAVE_ARCH_HIGHBIT
- return highbit32(v);
-#else
- int i;
-
- if (v & 0xffff0000)
- if (v & 0xff000000)
- i = 24;
- else
- i = 16;
- else if (v & 0x0000ffff)
- if (v & 0x0000ff00)
- i = 8;
- else
- i = 0;
- else
- return -1;
- return i + xfs_highbit[(v >> i) & 0xff];
-#endif
-}
-
-/*
- * xfs_lowbit64: get low bit set out of 64-bit argument, -1 if none set.
- */
-int
-xfs_lowbit64(
- __uint64_t v)
-{
- __uint32_t w = (__uint32_t)v;
- int n = 0;
-
- if (w) { /* lower bits */
- n = ffs(w);
- } else { /* upper bits */
- w = (__uint32_t)(v >> 32);
- if (w && (n = ffs(w)))
- n += 32;
- }
- return n - 1;
-}
-
-/*
- * xfs_highbit64: get high bit set out of 64-bit argument, -1 if none set.
- */
-int
-xfs_highbit64(
- __uint64_t v)
-{
- __uint32_t h = (__uint32_t)(v >> 32);
-
- if (h)
- return xfs_highbit32(h) + 32;
- return xfs_highbit32((__uint32_t)v);
-}
-
-
/*
* Return whether bitmap is empty.
* Size is number of words in the bitmap, which is padded to word boundary
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index 082641a9782..8e0e463dae2 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -47,13 +47,39 @@ static inline __uint64_t xfs_mask64lo(int n)
}
/* Get high bit set out of 32-bit argument, -1 if none set */
-extern int xfs_highbit32(__uint32_t v);
+static inline int xfs_highbit32(__uint32_t v)
+{
+ return fls(v) - 1;
+}
+
+/* Get high bit set out of 64-bit argument, -1 if none set */
+static inline int xfs_highbit64(__uint64_t v)
+{
+ return fls64(v) - 1;
+}
+
+/* Get low bit set out of 32-bit argument, -1 if none set */
+static inline int xfs_lowbit32(__uint32_t v)
+{
+ unsigned long t = v;
+ return (v) ? find_first_bit(&t, 32) : -1;
+}
/* Get low bit set out of 64-bit argument, -1 if none set */
-extern int xfs_lowbit64(__uint64_t v);
+static inline int xfs_lowbit64(__uint64_t v)
+{
+ __uint32_t w = (__uint32_t)v;
+ int n = 0;
-/* Get high bit set out of 64-bit argument, -1 if none set */
-extern int xfs_highbit64(__uint64_t);
+ if (w) { /* lower bits */
+ n = ffs(w);
+ } else { /* upper bits */
+ w = (__uint32_t)(v >> 32);
+ if (w && (n = ffs(w)))
+ n += 32;
+ }
+ return n - 1;
+}
/* Return whether bitmap is empty (1 == empty) */
extern int xfs_bitmap_empty(uint *map, uint size);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 53c259f5a5a..a1aab9275d5 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -384,14 +384,14 @@ xfs_bmap_count_tree(
int levelin,
int *count);
-STATIC int
+STATIC void
xfs_bmap_count_leaves(
xfs_ifork_t *ifp,
xfs_extnum_t idx,
int numrecs,
int *count);
-STATIC int
+STATIC void
xfs_bmap_disk_count_leaves(
xfs_extnum_t idx,
xfs_bmbt_block_t *block,
@@ -428,7 +428,8 @@ xfs_bmap_add_attrfork_btree(
cur->bc_private.b.firstblock = *firstblock;
if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
goto error0;
- ASSERT(stat == 1); /* must be at least one entry */
+ /* must be at least one entry */
+ XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
if ((error = xfs_bmbt_newroot(cur, flags, &stat)))
goto error0;
if (stat == 0) {
@@ -816,13 +817,13 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_startblock,
RIGHT.br_blockcount, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_delete(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_decrement(cur, 0, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount +
@@ -860,7 +861,7 @@ xfs_bmap_add_extent_delay_real(
LEFT.br_startblock, LEFT.br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount +
@@ -895,7 +896,7 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_startblock,
RIGHT.br_blockcount, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
new->br_startblock,
PREV.br_blockcount +
@@ -928,11 +929,11 @@ xfs_bmap_add_extent_delay_real(
new->br_startblock, new->br_blockcount,
&i)))
goto done;
- ASSERT(i == 0);
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
cur->bc_rec.b.br_state = XFS_EXT_NORM;
if ((error = xfs_bmbt_insert(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
*dnew = 0;
/* DELTA: The in-core extent described by new changed type. */
@@ -963,7 +964,7 @@ xfs_bmap_add_extent_delay_real(
LEFT.br_startblock, LEFT.br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount +
@@ -1004,11 +1005,11 @@ xfs_bmap_add_extent_delay_real(
new->br_startblock, new->br_blockcount,
&i)))
goto done;
- ASSERT(i == 0);
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
cur->bc_rec.b.br_state = XFS_EXT_NORM;
if ((error = xfs_bmbt_insert(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
ip->i_d.di_nextents > ip->i_df.if_ext_max) {
@@ -1054,7 +1055,7 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_startblock,
RIGHT.br_blockcount, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, new->br_startoff,
new->br_startblock,
new->br_blockcount +
@@ -1094,11 +1095,11 @@ xfs_bmap_add_extent_delay_real(
new->br_startblock, new->br_blockcount,
&i)))
goto done;
- ASSERT(i == 0);
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
cur->bc_rec.b.br_state = XFS_EXT_NORM;
if ((error = xfs_bmbt_insert(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
ip->i_d.di_nextents > ip->i_df.if_ext_max) {
@@ -1149,11 +1150,11 @@ xfs_bmap_add_extent_delay_real(
new->br_startblock, new->br_blockcount,
&i)))
goto done;
- ASSERT(i == 0);
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
cur->bc_rec.b.br_state = XFS_EXT_NORM;
if ((error = xfs_bmbt_insert(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
ip->i_d.di_nextents > ip->i_df.if_ext_max) {
@@ -1377,19 +1378,19 @@ xfs_bmap_add_extent_unwritten_real(
RIGHT.br_startblock,
RIGHT.br_blockcount, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_delete(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_decrement(cur, 0, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_delete(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_decrement(cur, 0, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount + PREV.br_blockcount +
@@ -1426,13 +1427,13 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_delete(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_decrement(cur, 0, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
LEFT.br_startblock,
LEFT.br_blockcount + PREV.br_blockcount,
@@ -1469,13 +1470,13 @@ xfs_bmap_add_extent_unwritten_real(
RIGHT.br_startblock,
RIGHT.br_blockcount, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_delete(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_decrement(cur, 0, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, new->br_startoff,
new->br_startblock,
new->br_blockcount + RIGHT.br_blockcount,
@@ -1508,7 +1509,7 @@ xfs_bmap_add_extent_unwritten_real(
new->br_startblock, new->br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, new->br_startoff,
new->br_startblock, new->br_blockcount,
newext)))
@@ -1549,7 +1550,7 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur,
PREV.br_startoff + new->br_blockcount,
PREV.br_startblock + new->br_blockcount,
@@ -1596,7 +1597,7 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur,
PREV.br_startoff + new->br_blockcount,
PREV.br_startblock + new->br_blockcount,
@@ -1606,7 +1607,7 @@ xfs_bmap_add_extent_unwritten_real(
cur->bc_rec.b = *new;
if ((error = xfs_bmbt_insert(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
/* DELTA: One in-core extent is split in two. */
temp = PREV.br_startoff;
@@ -1640,7 +1641,7 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock,
PREV.br_blockcount, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
PREV.br_startblock,
PREV.br_blockcount - new->br_blockcount,
@@ -1682,7 +1683,7 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
PREV.br_startblock,
PREV.br_blockcount - new->br_blockcount,
@@ -1692,11 +1693,11 @@ xfs_bmap_add_extent_unwritten_real(
new->br_startblock, new->br_blockcount,
&i)))
goto done;
- ASSERT(i == 0);
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
cur->bc_rec.b.br_state = XFS_EXT_NORM;
if ((error = xfs_bmbt_insert(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
/* DELTA: One in-core extent is split in two. */
temp = PREV.br_startoff;
@@ -1732,27 +1733,34 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_startblock, PREV.br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
/* new right extent - oldext */
if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
r[1].br_startblock, r[1].br_blockcount,
r[1].br_state)))
goto done;
/* new left extent - oldext */
- PREV.br_blockcount =
- new->br_startoff - PREV.br_startoff;
cur->bc_rec.b = PREV;
+ cur->bc_rec.b.br_blockcount =
+ new->br_startoff - PREV.br_startoff;
if ((error = xfs_bmbt_insert(cur, &i)))
goto done;
- ASSERT(i == 1);
- if ((error = xfs_bmbt_increment(cur, 0, &i)))
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
+ /*
+ * Reset the cursor to the position of the new extent
+ * we are about to insert as we can't trust it after
+ * the previous insert.
+ */
+ if ((error = xfs_bmbt_lookup_eq(cur, new->br_startoff,
+ new->br_startblock, new->br_blockcount,
+ &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
/* new middle extent - newext */
- cur->bc_rec.b = *new;
+ cur->bc_rec.b.br_state = new->br_state;
if ((error = xfs_bmbt_insert(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
/* DELTA: One in-core extent is split in three. */
temp = PREV.br_startoff;
@@ -2097,13 +2105,13 @@ xfs_bmap_add_extent_hole_real(
right.br_startblock,
right.br_blockcount, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_delete(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_decrement(cur, 0, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, left.br_startoff,
left.br_startblock,
left.br_blockcount +
@@ -2139,7 +2147,7 @@ xfs_bmap_add_extent_hole_real(
left.br_startblock,
left.br_blockcount, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, left.br_startoff,
left.br_startblock,
left.br_blockcount +
@@ -2174,7 +2182,7 @@ xfs_bmap_add_extent_hole_real(
right.br_startblock,
right.br_blockcount, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, new->br_startoff,
new->br_startblock,
new->br_blockcount +
@@ -2208,11 +2216,11 @@ xfs_bmap_add_extent_hole_real(
new->br_startblock,
new->br_blockcount, &i)))
goto done;
- ASSERT(i == 0);
+ XFS_WANT_CORRUPTED_GOTO(i == 0, done);
cur->bc_rec.b.br_state = new->br_state;
if ((error = xfs_bmbt_insert(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
/* DELTA: A new extent was added in a hole. */
temp = new->br_startoff;
@@ -3131,7 +3139,7 @@ xfs_bmap_del_extent(
got.br_startblock, got.br_blockcount,
&i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
da_old = da_new = 0;
} else {
@@ -3164,7 +3172,7 @@ xfs_bmap_del_extent(
}
if ((error = xfs_bmbt_delete(cur, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
break;
case 2:
@@ -3268,7 +3276,7 @@ xfs_bmap_del_extent(
got.br_startblock,
temp, &i)))
goto done;
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
/*
* Update the btree record back
* to the original value.
@@ -3289,7 +3297,7 @@ xfs_bmap_del_extent(
error = XFS_ERROR(ENOSPC);
goto done;
}
- ASSERT(i == 1);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, done);
} else
flags |= XFS_ILOG_FEXT(whichfork);
XFS_IFORK_NEXT_SET(ip, whichfork,
@@ -3992,7 +4000,7 @@ xfs_bmap_add_attrfork(
ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
}
ASSERT(ip->i_d.di_anextents == 0);
- VN_HOLD(XFS_ITOV(ip));
+ IHOLD(ip);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
switch (ip->i_d.di_format) {
@@ -5970,7 +5978,7 @@ unlock_and_return:
xfs_iunlock_map_shared(ip, lock);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- kmem_free(map, subnex * sizeof(*map));
+ kmem_free(map);
return error;
}
@@ -6088,7 +6096,7 @@ xfs_bmap_get_bp(
tp = cur->bc_tp;
licp = &tp->t_items;
while (!bp && licp != NULL) {
- if (XFS_LIC_ARE_ALL_FREE(licp)) {
+ if (xfs_lic_are_all_free(licp)) {
licp = licp->lic_next;
continue;
}
@@ -6098,11 +6106,11 @@ xfs_bmap_get_bp(
xfs_buf_log_item_t *bip;
xfs_buf_t *lbp;
- if (XFS_LIC_ISFREE(licp, i)) {
+ if (xfs_lic_isfree(licp, i)) {
continue;
}
- lidp = XFS_LIC_SLOT(licp, i);
+ lidp = xfs_lic_slot(licp, i);
lip = lidp->lid_item;
if (lip->li_type != XFS_LI_BUF)
continue;
@@ -6359,13 +6367,9 @@ xfs_bmap_count_blocks(
mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) {
- if (unlikely(xfs_bmap_count_leaves(ifp, 0,
+ xfs_bmap_count_leaves(ifp, 0,
ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
- count) < 0)) {
- XFS_ERROR_REPORT("xfs_bmap_count_blocks(1)",
- XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
+ count);
return 0;
}
@@ -6446,13 +6450,7 @@ xfs_bmap_count_tree(
for (;;) {
nextbno = be64_to_cpu(block->bb_rightsib);
numrecs = be16_to_cpu(block->bb_numrecs);
- if (unlikely(xfs_bmap_disk_count_leaves(0,
- block, numrecs, count) < 0)) {
- xfs_trans_brelse(tp, bp);
- XFS_ERROR_REPORT("xfs_bmap_count_tree(2)",
- XFS_ERRLEVEL_LOW, mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
+ xfs_bmap_disk_count_leaves(0, block, numrecs, count);
xfs_trans_brelse(tp, bp);
if (nextbno == NULLFSBLOCK)
break;
@@ -6470,7 +6468,7 @@ xfs_bmap_count_tree(
/*
* Count leaf blocks given a range of extent records.
*/
-STATIC int
+STATIC void
xfs_bmap_count_leaves(
xfs_ifork_t *ifp,
xfs_extnum_t idx,
@@ -6483,14 +6481,13 @@ xfs_bmap_count_leaves(
xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
*count += xfs_bmbt_get_blockcount(frp);
}
- return 0;
}
/*
* Count leaf blocks given a range of extent records originally
* in btree format.
*/
-STATIC int
+STATIC void
xfs_bmap_disk_count_leaves(
xfs_extnum_t idx,
xfs_bmbt_block_t *block,
@@ -6504,5 +6501,4 @@ xfs_bmap_disk_count_leaves(
frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b);
*count += xfs_bmbt_disk_get_blockcount(frp);
}
- return 0;
}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 6ff70cda451..9f3e3a836d1 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -54,12 +54,23 @@ typedef struct xfs_bmap_free_item
/*
* Header for free extent list.
+ *
+ * xbf_low is used by the allocator to activate the lowspace algorithm -
+ * when free space is running low the extent allocator may choose to
+ * allocate an extent from an AG without leaving sufficient space for
+ * a btree split when inserting the new extent. In this case the allocator
+ * will enable the lowspace algorithm which is supposed to allow further
+ * allocations (such as btree splits and newroots) to allocate from
+ * sequential AGs. In order to avoid locking AGs out of order the lowspace
+ * algorithm will start searching for free space from AG 0. If the correct
+ * transaction reservations have been made then this algorithm will eventually
+ * find all the space it needs.
*/
typedef struct xfs_bmap_free
{
xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */
int xbf_count; /* count of items on list */
- int xbf_low; /* kludge: alloc in low mode */
+ int xbf_low; /* alloc in low mode */
} xfs_bmap_free_t;
#define XFS_BMAP_MAX_NMAP 4
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 4f0e849d973..23efad29a5c 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -1493,12 +1493,27 @@ xfs_bmbt_split(
left = XFS_BUF_TO_BMBT_BLOCK(lbp);
args.fsbno = cur->bc_private.b.firstblock;
args.firstblock = args.fsbno;
+ args.minleft = 0;
if (args.fsbno == NULLFSBLOCK) {
args.fsbno = lbno;
args.type = XFS_ALLOCTYPE_START_BNO;
- } else
+ /*
+ * Make sure there is sufficient room left in the AG to
+ * complete a full tree split for an extent insert. If
+ * we are converting the middle part of an extent then
+ * we may need space for two tree splits.
+ *
+ * We are relying on the caller to make the correct block
+ * reservation for this operation to succeed. If the
+ * reservation amount is insufficient then we may fail a
+ * block allocation here and corrupt the filesystem.
+ */
+ args.minleft = xfs_trans_get_block_res(args.tp);
+ } else if (cur->bc_private.b.flist->xbf_low)
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ else
args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.mod = args.minleft = args.alignment = args.total = args.isfl =
+ args.mod = args.alignment = args.total = args.isfl =
args.userdata = args.minalignslop = 0;
args.minlen = args.maxlen = args.prod = 1;
args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
@@ -1510,6 +1525,21 @@ xfs_bmbt_split(
XFS_BMBT_TRACE_CURSOR(cur, ERROR);
return error;
}
+ if (args.fsbno == NULLFSBLOCK && args.minleft) {
+ /*
+ * Could not find an AG with enough free space to satisfy
+ * a full btree split. Try again without minleft and if
+ * successful activate the lowspace algorithm.
+ */
+ args.fsbno = 0;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ args.minleft = 0;
+ if ((error = xfs_alloc_vextent(&args))) {
+ XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+ return error;
+ }
+ cur->bc_private.b.flist->xbf_low = 1;
+ }
if (args.fsbno == NULLFSBLOCK) {
XFS_BMBT_TRACE_CURSOR(cur, EXIT);
*stat = 0;
@@ -2029,22 +2059,8 @@ xfs_bmbt_increment(
* Insert the current record at the point referenced by cur.
*
* A multi-level split of the tree on insert will invalidate the original
- * cursor. It appears, however, that some callers assume that the cursor is
- * always valid. Hence if we do a multi-level split we need to revalidate the
- * cursor.
- *
- * When a split occurs, we will see a new cursor returned. Use that as a
- * trigger to determine if we need to revalidate the original cursor. If we get
- * a split, then use the original irec to lookup up the path of the record we
- * just inserted.
- *
- * Note that the fact that the btree root is in the inode means that we can
- * have the level of the tree change without a "split" occurring at the root
- * level. What happens is that the root is migrated to an allocated block and
- * the inode root is pointed to it. This means a single split can change the
- * level of the tree (level 2 -> level 3) and invalidate the old cursor. Hence
- * the level change should be accounted as a split so as to correctly trigger a
- * revalidation of the old cursor.
+ * cursor. All callers of this function should assume that the cursor is
+ * no longer valid and revalidate it.
*/
int /* error */
xfs_bmbt_insert(
@@ -2057,14 +2073,11 @@ xfs_bmbt_insert(
xfs_fsblock_t nbno;
xfs_btree_cur_t *ncur;
xfs_bmbt_rec_t nrec;
- xfs_bmbt_irec_t oirec; /* original irec */
xfs_btree_cur_t *pcur;
- int splits = 0;
XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
level = 0;
nbno = NULLFSBLOCK;
- oirec = cur->bc_rec.b;
xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
ncur = NULL;
pcur = cur;
@@ -2073,13 +2086,11 @@ xfs_bmbt_insert(
&i))) {
if (pcur != cur)
xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
- goto error0;
+ XFS_BMBT_TRACE_CURSOR(cur, ERROR);
+ return error;
}
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
- /* allocating a new root is effectively a split */
- if (cur->bc_nlevels != pcur->bc_nlevels)
- splits++;
cur->bc_nlevels = pcur->bc_nlevels;
cur->bc_private.b.allocated +=
pcur->bc_private.b.allocated;
@@ -2093,21 +2104,10 @@ xfs_bmbt_insert(
xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
}
if (ncur) {
- splits++;
pcur = ncur;
ncur = NULL;
}
} while (nbno != NULLFSBLOCK);
-
- if (splits > 1) {
- /* revalidate the old cursor as we had a multi-level split */
- error = xfs_bmbt_lookup_eq(cur, oirec.br_startoff,
- oirec.br_startblock, oirec.br_blockcount, &i);
- if (error)
- goto error0;
- ASSERT(i == 1);
- }
-
XFS_BMBT_TRACE_CURSOR(cur, EXIT);
*stat = i;
return 0;
@@ -2254,7 +2254,9 @@ xfs_bmbt_newroot(
#endif
args.fsbno = be64_to_cpu(*pp);
args.type = XFS_ALLOCTYPE_START_BNO;
- } else
+ } else if (cur->bc_private.b.flist->xbf_low)
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ else
args.type = XFS_ALLOCTYPE_NEAR_BNO;
if ((error = xfs_alloc_vextent(&args))) {
XFS_BMBT_TRACE_CURSOR(cur, ERROR);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index aeb87ca69fc..cc593a84c34 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -46,38 +46,11 @@ kmem_zone_t *xfs_btree_cur_zone;
/*
* Btree magic numbers.
*/
-const __uint32_t xfs_magics[XFS_BTNUM_MAX] =
-{
+const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
};
/*
- * Prototypes for internal routines.
- */
-
-/*
- * Checking routine: return maxrecs for the block.
- */
-STATIC int /* number of records fitting in block */
-xfs_btree_maxrecs(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_block_t *block);/* generic btree block pointer */
-
-/*
- * Internal routines.
- */
-
-/*
- * Retrieve the block pointer from the cursor at the given level.
- * This may be a bmap btree root or from a buffer.
- */
-STATIC xfs_btree_block_t * /* generic btree block pointer */
-xfs_btree_get_block(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level in btree */
- struct xfs_buf **bpp); /* buffer containing the block */
-
-/*
* Checking routine: return maxrecs for the block.
*/
STATIC int /* number of records fitting in block */
@@ -457,35 +430,6 @@ xfs_btree_dup_cursor(
}
/*
- * Change the cursor to point to the first record at the given level.
- * Other levels are unaffected.
- */
-int /* success=1, failure=0 */
-xfs_btree_firstrec(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level) /* level to change */
-{
- xfs_btree_block_t *block; /* generic btree block pointer */
- xfs_buf_t *bp; /* buffer containing block */
-
- /*
- * Get the block pointer for this level.
- */
- block = xfs_btree_get_block(cur, level, &bp);
- xfs_btree_check_block(cur, block, level, bp);
- /*
- * It's empty, there is no such record.
- */
- if (!block->bb_h.bb_numrecs)
- return 0;
- /*
- * Set the ptr value to 1, that's the first record/key.
- */
- cur->bc_ptrs[level] = 1;
- return 1;
-}
-
-/*
* Retrieve the block pointer from the cursor at the given level.
* This may be a bmap btree root or from a buffer.
*/
@@ -626,6 +570,13 @@ xfs_btree_init_cursor(
cur->bc_private.a.agbp = agbp;
cur->bc_private.a.agno = agno;
break;
+ case XFS_BTNUM_INO:
+ /*
+ * Inode allocation btree fields.
+ */
+ cur->bc_private.a.agbp = agbp;
+ cur->bc_private.a.agno = agno;
+ break;
case XFS_BTNUM_BMAP:
/*
* Bmap btree fields.
@@ -638,13 +589,6 @@ xfs_btree_init_cursor(
cur->bc_private.b.flags = 0;
cur->bc_private.b.whichfork = whichfork;
break;
- case XFS_BTNUM_INO:
- /*
- * Inode allocation btree fields.
- */
- cur->bc_private.i.agbp = agbp;
- cur->bc_private.i.agno = agno;
- break;
default:
ASSERT(0);
}
@@ -671,6 +615,35 @@ xfs_btree_islastblock(
}
/*
+ * Change the cursor to point to the first record at the given level.
+ * Other levels are unaffected.
+ */
+int /* success=1, failure=0 */
+xfs_btree_firstrec(
+ xfs_btree_cur_t *cur, /* btree cursor */
+ int level) /* level to change */
+{
+ xfs_btree_block_t *block; /* generic btree block pointer */
+ xfs_buf_t *bp; /* buffer containing block */
+
+ /*
+ * Get the block pointer for this level.
+ */
+ block = xfs_btree_get_block(cur, level, &bp);
+ xfs_btree_check_block(cur, block, level, bp);
+ /*
+ * It's empty, there is no such record.
+ */
+ if (!block->bb_h.bb_numrecs)
+ return 0;
+ /*
+ * Set the ptr value to 1, that's the first record/key.
+ */
+ cur->bc_ptrs[level] = 1;
+ return 1;
+}
+
+/*
* Change the cursor to point to the last record in the current block
* at the given level. Other levels are unaffected.
*/
@@ -890,12 +863,12 @@ xfs_btree_readahead_core(
case XFS_BTNUM_INO:
i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
be32_to_cpu(i->bb_leftsib), 1);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.i.agno,
+ xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
be32_to_cpu(i->bb_rightsib), 1);
rval++;
}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 7440b78f9ce..1f528a2a375 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -158,8 +158,8 @@ typedef struct xfs_btree_cur
__uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */
xfs_btnum_t bc_btnum; /* identifies which btree type */
union {
- struct { /* needed for BNO, CNT */
- struct xfs_buf *agbp; /* agf buffer pointer */
+ struct { /* needed for BNO, CNT, INO */
+ struct xfs_buf *agbp; /* agf/agi buffer pointer */
xfs_agnumber_t agno; /* ag number */
} a;
struct { /* needed for BMAP */
@@ -172,10 +172,6 @@ typedef struct xfs_btree_cur
char flags; /* flags */
#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */
} b;
- struct { /* needed for INO */
- struct xfs_buf *agbp; /* agi buffer pointer */
- xfs_agnumber_t agno; /* ag number */
- } i;
} bc_private; /* per-btree type data */
} xfs_btree_cur_t;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 53a71c62025..002fc2617c8 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -732,12 +732,13 @@ xfs_buf_item_init(
bip->bli_item.li_ops = &xfs_buf_item_ops;
bip->bli_item.li_mountp = mp;
bip->bli_buf = bp;
+ xfs_buf_hold(bp);
bip->bli_format.blf_type = XFS_LI_BUF;
bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
bip->bli_format.blf_map_size = map_size;
#ifdef XFS_BLI_TRACE
- bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_SLEEP);
+ bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS);
#endif
#ifdef XFS_TRANS_DEBUG
@@ -867,6 +868,21 @@ xfs_buf_item_dirty(
return (bip->bli_flags & XFS_BLI_DIRTY);
}
+STATIC void
+xfs_buf_item_free(
+ xfs_buf_log_item_t *bip)
+{
+#ifdef XFS_TRANS_DEBUG
+ kmem_free(bip->bli_orig);
+ kmem_free(bip->bli_logged);
+#endif /* XFS_TRANS_DEBUG */
+
+#ifdef XFS_BLI_TRACE
+ ktrace_free(bip->bli_trace);
+#endif
+ kmem_zone_free(xfs_buf_item_zone, bip);
+}
+
/*
* This is called when the buf log item is no longer needed. It should
* free the buf log item associated with the given buffer and clear
@@ -887,18 +903,8 @@ xfs_buf_item_relse(
(XFS_BUF_IODONE_FUNC(bp) != NULL)) {
XFS_BUF_CLR_IODONE_FUNC(bp);
}
-
-#ifdef XFS_TRANS_DEBUG
- kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
- bip->bli_orig = NULL;
- kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
- bip->bli_logged = NULL;
-#endif /* XFS_TRANS_DEBUG */
-
-#ifdef XFS_BLI_TRACE
- ktrace_free(bip->bli_trace);
-#endif
- kmem_zone_free(xfs_buf_item_zone, bip);
+ xfs_buf_rele(bp);
+ xfs_buf_item_free(bip);
}
@@ -1056,7 +1062,7 @@ xfs_buf_iodone_callbacks(
anyway. */
XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
XFS_BUF_DONE(bp);
- XFS_BUF_V_IODONESEMA(bp);
+ XFS_BUF_FINISH_IOWAIT(bp);
}
return;
}
@@ -1120,6 +1126,7 @@ xfs_buf_iodone(
ASSERT(bip->bli_buf == bp);
+ xfs_buf_rele(bp);
mp = bip->bli_item.li_mountp;
/*
@@ -1136,18 +1143,7 @@ xfs_buf_iodone(
* xfs_trans_delete_ail() drops the AIL lock.
*/
xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
-
-#ifdef XFS_TRANS_DEBUG
- kmem_free(bip->bli_orig, XFS_BUF_COUNT(bp));
- bip->bli_orig = NULL;
- kmem_free(bip->bli_logged, XFS_BUF_COUNT(bp) / NBBY);
- bip->bli_logged = NULL;
-#endif /* XFS_TRANS_DEBUG */
-
-#ifdef XFS_BLI_TRACE
- ktrace_free(bip->bli_trace);
-#endif
- kmem_zone_free(xfs_buf_item_zone, bip);
+ xfs_buf_item_free(bip);
}
#if defined(XFS_BLI_TRACE)
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
index d5d1e60ee22..d2ce5dd70d8 100644
--- a/fs/xfs/xfs_clnt.h
+++ b/fs/xfs/xfs_clnt.h
@@ -78,6 +78,7 @@ struct xfs_mount_args {
#define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */
#define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */
/* (osyncisdsync is default) */
+#define XFSMNT_NOATTR2 0x00008000 /* turn off ATTR2 EA format */
#define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32
* bits of address space */
#define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 021a8f7e563..9e561a9cefc 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1431,7 +1431,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
}
if (level < 0) {
*result = XFS_ERROR(ENOENT); /* we're out of our tree */
- ASSERT(args->oknoent);
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
return(0);
}
@@ -1530,6 +1530,28 @@ xfs_da_hashname(const uchar_t *name, int namelen)
}
}
+enum xfs_dacmp
+xfs_da_compname(
+ struct xfs_da_args *args,
+ const char *name,
+ int len)
+{
+ return (args->namelen == len && memcmp(args->name, name, len) == 0) ?
+ XFS_CMP_EXACT : XFS_CMP_DIFFERENT;
+}
+
+static xfs_dahash_t
+xfs_default_hashname(
+ struct xfs_name *name)
+{
+ return xfs_da_hashname(name->name, name->len);
+}
+
+const struct xfs_nameops xfs_default_nameops = {
+ .hashname = xfs_default_hashname,
+ .compname = xfs_da_compname
+};
+
/*
* Add a block to the btree ahead of the file.
* Return the new block number to the caller.
@@ -1598,7 +1620,7 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
args->firstblock, args->total,
&mapp[mapi], &nmap, args->flist,
NULL))) {
- kmem_free(mapp, sizeof(*mapp) * count);
+ kmem_free(mapp);
return error;
}
if (nmap < 1)
@@ -1620,11 +1642,11 @@ xfs_da_grow_inode(xfs_da_args_t *args, xfs_dablk_t *new_blkno)
mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
bno + count) {
if (mapp != &map)
- kmem_free(mapp, sizeof(*mapp) * count);
+ kmem_free(mapp);
return XFS_ERROR(ENOSPC);
}
if (mapp != &map)
- kmem_free(mapp, sizeof(*mapp) * count);
+ kmem_free(mapp);
*new_blkno = (xfs_dablk_t)bno;
return 0;
}
@@ -2090,10 +2112,10 @@ xfs_da_do_buf(
}
}
if (bplist) {
- kmem_free(bplist, sizeof(*bplist) * nmap);
+ kmem_free(bplist);
}
if (mapp != &map) {
- kmem_free(mapp, sizeof(*mapp) * nfsb);
+ kmem_free(mapp);
}
if (bpp)
*bpp = rbp;
@@ -2102,11 +2124,11 @@ exit1:
if (bplist) {
for (i = 0; i < nbplist; i++)
xfs_trans_brelse(trans, bplist[i]);
- kmem_free(bplist, sizeof(*bplist) * nmap);
+ kmem_free(bplist);
}
exit0:
if (mapp != &map)
- kmem_free(mapp, sizeof(*mapp) * nfsb);
+ kmem_free(mapp);
if (bpp)
*bpp = NULL;
return error;
@@ -2218,7 +2240,7 @@ xfs_da_state_free(xfs_da_state_t *state)
#ifdef XFS_DABUF_DEBUG
xfs_dabuf_t *xfs_dabuf_global_list;
-spinlock_t xfs_dabuf_global_lock;
+static DEFINE_SPINLOCK(xfs_dabuf_global_lock);
#endif
/*
@@ -2315,7 +2337,7 @@ xfs_da_buf_done(xfs_dabuf_t *dabuf)
if (dabuf->dirty)
xfs_da_buf_clean(dabuf);
if (dabuf->nbuf > 1)
- kmem_free(dabuf->data, BBTOB(dabuf->bbcount));
+ kmem_free(dabuf->data);
#ifdef XFS_DABUF_DEBUG
{
spin_lock(&xfs_dabuf_global_lock);
@@ -2332,7 +2354,7 @@ xfs_da_buf_done(xfs_dabuf_t *dabuf)
if (dabuf->nbuf == 1)
kmem_zone_free(xfs_dabuf_zone, dabuf);
else
- kmem_free(dabuf, XFS_DA_BUF_SIZE(dabuf->nbuf));
+ kmem_free(dabuf);
}
/*
@@ -2403,7 +2425,7 @@ xfs_da_brelse(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
for (i = 0; i < nbuf; i++)
xfs_trans_brelse(tp, bplist[i]);
if (bplist != &bp)
- kmem_free(bplist, nbuf * sizeof(*bplist));
+ kmem_free(bplist);
}
/*
@@ -2429,7 +2451,7 @@ xfs_da_binval(xfs_trans_t *tp, xfs_dabuf_t *dabuf)
for (i = 0; i < nbuf; i++)
xfs_trans_binval(tp, bplist[i]);
if (bplist != &bp)
- kmem_free(bplist, nbuf * sizeof(*bplist));
+ kmem_free(bplist);
}
/*
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 7facf86f74f..8be0b00ede9 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -99,6 +99,15 @@ typedef struct xfs_da_node_entry xfs_da_node_entry_t;
*========================================================================*/
/*
+ * Search comparison results
+ */
+enum xfs_dacmp {
+ XFS_CMP_DIFFERENT, /* names are completely different */
+ XFS_CMP_EXACT, /* names are exactly the same */
+ XFS_CMP_CASE /* names are same but differ in case */
+};
+
+/*
* Structure to ease passing around component names.
*/
typedef struct xfs_da_args {
@@ -123,13 +132,20 @@ typedef struct xfs_da_args {
int index2; /* index of 2nd attr in blk */
xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
int rmtblkcnt2; /* remote attr value block count */
- unsigned char justcheck; /* T/F: check for ok with no space */
- unsigned char rename; /* T/F: this is an atomic rename op */
- unsigned char addname; /* T/F: this is an add operation */
- unsigned char oknoent; /* T/F: ok to return ENOENT, else die */
+ int op_flags; /* operation flags */
+ enum xfs_dacmp cmpresult; /* name compare result for lookups */
} xfs_da_args_t;
/*
+ * Operation flags:
+ */
+#define XFS_DA_OP_JUSTCHECK 0x0001 /* check for ok with no space */
+#define XFS_DA_OP_RENAME 0x0002 /* this is an atomic rename op */
+#define XFS_DA_OP_ADDNAME 0x0004 /* this is an add operation */
+#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
+#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
+
+/*
* Structure to describe buffer(s) for a block.
* This is needed in the directory version 2 format case, when
* multiple non-contiguous fsblocks might be needed to cover one
@@ -201,6 +217,14 @@ typedef struct xfs_da_state {
(uint)(XFS_DA_LOGOFF(BASE, ADDR)), \
(uint)(XFS_DA_LOGOFF(BASE, ADDR)+(SIZE)-1)
+/*
+ * Name ops for directory and/or attr name operations
+ */
+struct xfs_nameops {
+ xfs_dahash_t (*hashname)(struct xfs_name *);
+ enum xfs_dacmp (*compname)(struct xfs_da_args *, const char *, int);
+};
+
#ifdef __KERNEL__
/*========================================================================
@@ -249,6 +273,10 @@ int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
xfs_dabuf_t *dead_buf);
uint xfs_da_hashname(const uchar_t *name_string, int name_length);
+enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
+ const char *name, int len);
+
+
xfs_da_state_t *xfs_da_state_alloc(void);
void xfs_da_state_free(xfs_da_state_t *state);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 5f3647cb988..75b0cd4da0e 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -116,7 +116,7 @@ xfs_swapext(
out_put_file:
fput(file);
out_free_sxp:
- kmem_free(sxp, sizeof(xfs_swapext_t));
+ kmem_free(sxp);
out:
return error;
}
@@ -128,10 +128,8 @@ xfs_swap_extents(
xfs_swapext_t *sxp)
{
xfs_mount_t *mp;
- xfs_inode_t *ips[2];
xfs_trans_t *tp;
xfs_bstat_t *sbp = &sxp->sx_stat;
- bhv_vnode_t *vp, *tvp;
xfs_ifork_t *tempifp, *ifp, *tifp;
int ilf_fields, tilf_fields;
static uint lock_flags = XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL;
@@ -150,19 +148,15 @@ xfs_swap_extents(
}
sbp = &sxp->sx_stat;
- vp = XFS_ITOV(ip);
- tvp = XFS_ITOV(tip);
-
- /* Lock in i_ino order */
- if (ip->i_ino < tip->i_ino) {
- ips[0] = ip;
- ips[1] = tip;
- } else {
- ips[0] = tip;
- ips[1] = ip;
- }
- xfs_lock_inodes(ips, 2, lock_flags);
+ /*
+ * we have to do two separate lock calls here to keep lockdep
+ * happy. If we try to get all the locks in one call, lock will
+ * report false positives when we drop the ILOCK and regain them
+ * below.
+ */
+ xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+ xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
locked = 1;
/* Verify that both files have the same format */
@@ -184,7 +178,7 @@ xfs_swap_extents(
goto error0;
}
- if (VN_CACHED(tvp) != 0) {
+ if (VN_CACHED(VFS_I(tip)) != 0) {
xfs_inval_cached_trace(tip, 0, -1, 0, -1);
error = xfs_flushinval_pages(tip, 0, -1,
FI_REMAPF_LOCKED);
@@ -193,7 +187,7 @@ xfs_swap_extents(
}
/* Verify O_DIRECT for ftmp */
- if (VN_CACHED(tvp) != 0) {
+ if (VN_CACHED(VFS_I(tip)) != 0) {
error = XFS_ERROR(EINVAL);
goto error0;
}
@@ -237,7 +231,7 @@ xfs_swap_extents(
* vop_read (or write in the case of autogrow) they block on the iolock
* until we have switched the extents.
*/
- if (VN_MAPPED(vp)) {
+ if (VN_MAPPED(VFS_I(ip))) {
error = XFS_ERROR(EBUSY);
goto error0;
}
@@ -265,7 +259,7 @@ xfs_swap_extents(
locked = 0;
goto error0;
}
- xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
+ xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
/*
* Count the number of extended attribute blocks
@@ -350,15 +344,11 @@ xfs_swap_extents(
break;
}
- /*
- * Increment vnode ref counts since xfs_trans_commit &
- * xfs_trans_cancel will both unlock the inodes and
- * decrement the associated ref counts.
- */
- VN_HOLD(vp);
- VN_HOLD(tvp);
+ IHOLD(ip);
xfs_trans_ijoin(tp, ip, lock_flags);
+
+ IHOLD(tip);
xfs_trans_ijoin(tp, tip, lock_flags);
xfs_trans_log_inode(tp, ip, ilf_fields);
@@ -381,6 +371,6 @@ xfs_swap_extents(
xfs_iunlock(tip, lock_flags);
}
if (tempifp != NULL)
- kmem_free(tempifp, sizeof(xfs_ifork_t));
+ kmem_free(tempifp);
return error;
}
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index 7cb26529766..80e0dc51361 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -46,6 +46,54 @@
struct xfs_name xfs_name_dotdot = {"..", 2};
+extern const struct xfs_nameops xfs_default_nameops;
+
+/*
+ * ASCII case-insensitive (ie. A-Z) support for directories that was
+ * used in IRIX.
+ */
+STATIC xfs_dahash_t
+xfs_ascii_ci_hashname(
+ struct xfs_name *name)
+{
+ xfs_dahash_t hash;
+ int i;
+
+ for (i = 0, hash = 0; i < name->len; i++)
+ hash = tolower(name->name[i]) ^ rol32(hash, 7);
+
+ return hash;
+}
+
+STATIC enum xfs_dacmp
+xfs_ascii_ci_compname(
+ struct xfs_da_args *args,
+ const char *name,
+ int len)
+{
+ enum xfs_dacmp result;
+ int i;
+
+ if (args->namelen != len)
+ return XFS_CMP_DIFFERENT;
+
+ result = XFS_CMP_EXACT;
+ for (i = 0; i < len; i++) {
+ if (args->name[i] == name[i])
+ continue;
+ if (tolower(args->name[i]) != tolower(name[i]))
+ return XFS_CMP_DIFFERENT;
+ result = XFS_CMP_CASE;
+ }
+
+ return result;
+}
+
+static struct xfs_nameops xfs_ascii_ci_nameops = {
+ .hashname = xfs_ascii_ci_hashname,
+ .compname = xfs_ascii_ci_compname,
+};
+
void
xfs_dir_mount(
xfs_mount_t *mp)
@@ -65,6 +113,10 @@ xfs_dir_mount(
(mp->m_dirblksize - (uint)sizeof(xfs_da_node_hdr_t)) /
(uint)sizeof(xfs_da_node_entry_t);
mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
+ if (xfs_sb_version_hasasciici(&mp->m_sb))
+ mp->m_dirnameops = &xfs_ascii_ci_nameops;
+ else
+ mp->m_dirnameops = &xfs_default_nameops;
}
/*
@@ -162,9 +214,10 @@ xfs_dir_createname(
return rval;
XFS_STATS_INC(xs_dir_create);
+ memset(&args, 0, sizeof(xfs_da_args_t));
args.name = name->name;
args.namelen = name->len;
- args.hashval = xfs_da_hashname(name->name, name->len);
+ args.hashval = dp->i_mount->m_dirnameops->hashname(name);
args.inumber = inum;
args.dp = dp;
args.firstblock = first;
@@ -172,8 +225,7 @@ xfs_dir_createname(
args.total = total;
args.whichfork = XFS_DATA_FORK;
args.trans = tp;
- args.justcheck = 0;
- args.addname = args.oknoent = 1;
+ args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
rval = xfs_dir2_sf_addname(&args);
@@ -191,14 +243,43 @@ xfs_dir_createname(
}
/*
+ * If doing a CI lookup and case-insensitive match, dup actual name into
+ * args.value. Return EEXIST for success (ie. name found) or an error.
+ */
+int
+xfs_dir_cilookup_result(
+ struct xfs_da_args *args,
+ const char *name,
+ int len)
+{
+ if (args->cmpresult == XFS_CMP_DIFFERENT)
+ return ENOENT;
+ if (args->cmpresult != XFS_CMP_CASE ||
+ !(args->op_flags & XFS_DA_OP_CILOOKUP))
+ return EEXIST;
+
+ args->value = kmem_alloc(len, KM_MAYFAIL);
+ if (!args->value)
+ return ENOMEM;
+
+ memcpy(args->value, name, len);
+ args->valuelen = len;
+ return EEXIST;
+}
+
+/*
* Lookup a name in a directory, give back the inode number.
+ * If ci_name is not NULL, returns the actual name in ci_name if it differs
+ * to name, or ci_name->name is set to NULL for an exact match.
*/
+
int
xfs_dir_lookup(
xfs_trans_t *tp,
xfs_inode_t *dp,
struct xfs_name *name,
- xfs_ino_t *inum) /* out: inode number */
+ xfs_ino_t *inum, /* out: inode number */
+ struct xfs_name *ci_name) /* out: actual name if CI match */
{
xfs_da_args_t args;
int rval;
@@ -206,15 +287,17 @@ xfs_dir_lookup(
ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
XFS_STATS_INC(xs_dir_lookup);
- memset(&args, 0, sizeof(xfs_da_args_t));
+ memset(&args, 0, sizeof(xfs_da_args_t));
args.name = name->name;
args.namelen = name->len;
- args.hashval = xfs_da_hashname(name->name, name->len);
+ args.hashval = dp->i_mount->m_dirnameops->hashname(name);
args.dp = dp;
args.whichfork = XFS_DATA_FORK;
args.trans = tp;
- args.oknoent = 1;
+ args.op_flags = XFS_DA_OP_OKNOENT;
+ if (ci_name)
+ args.op_flags |= XFS_DA_OP_CILOOKUP;
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
rval = xfs_dir2_sf_lookup(&args);
@@ -230,8 +313,13 @@ xfs_dir_lookup(
rval = xfs_dir2_node_lookup(&args);
if (rval == EEXIST)
rval = 0;
- if (rval == 0)
+ if (!rval) {
*inum = args.inumber;
+ if (ci_name) {
+ ci_name->name = args.value;
+ ci_name->len = args.valuelen;
+ }
+ }
return rval;
}
@@ -255,9 +343,10 @@ xfs_dir_removename(
ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
XFS_STATS_INC(xs_dir_remove);
+ memset(&args, 0, sizeof(xfs_da_args_t));
args.name = name->name;
args.namelen = name->len;
- args.hashval = xfs_da_hashname(name->name, name->len);
+ args.hashval = dp->i_mount->m_dirnameops->hashname(name);
args.inumber = ino;
args.dp = dp;
args.firstblock = first;
@@ -265,7 +354,6 @@ xfs_dir_removename(
args.total = total;
args.whichfork = XFS_DATA_FORK;
args.trans = tp;
- args.justcheck = args.addname = args.oknoent = 0;
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
rval = xfs_dir2_sf_removename(&args);
@@ -338,9 +426,10 @@ xfs_dir_replace(
if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
return rval;
+ memset(&args, 0, sizeof(xfs_da_args_t));
args.name = name->name;
args.namelen = name->len;
- args.hashval = xfs_da_hashname(name->name, name->len);
+ args.hashval = dp->i_mount->m_dirnameops->hashname(name);
args.inumber = inum;
args.dp = dp;
args.firstblock = first;
@@ -348,7 +437,6 @@ xfs_dir_replace(
args.total = total;
args.whichfork = XFS_DATA_FORK;
args.trans = tp;
- args.justcheck = args.addname = args.oknoent = 0;
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
rval = xfs_dir2_sf_replace(&args);
@@ -384,15 +472,16 @@ xfs_dir_canenter(
return 0;
ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
- memset(&args, 0, sizeof(xfs_da_args_t));
+ memset(&args, 0, sizeof(xfs_da_args_t));
args.name = name->name;
args.namelen = name->len;
- args.hashval = xfs_da_hashname(name->name, name->len);
+ args.hashval = dp->i_mount->m_dirnameops->hashname(name);
args.dp = dp;
args.whichfork = XFS_DATA_FORK;
args.trans = tp;
- args.justcheck = args.addname = args.oknoent = 1;
+ args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
+ XFS_DA_OP_OKNOENT;
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
rval = xfs_dir2_sf_addname(&args);
@@ -493,7 +582,7 @@ xfs_dir2_grow_inode(
args->firstblock, args->total,
&mapp[mapi], &nmap, args->flist,
NULL))) {
- kmem_free(mapp, sizeof(*mapp) * count);
+ kmem_free(mapp);
return error;
}
if (nmap < 1)
@@ -525,14 +614,14 @@ xfs_dir2_grow_inode(
mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
bno + count) {
if (mapp != &map)
- kmem_free(mapp, sizeof(*mapp) * count);
+ kmem_free(mapp);
return XFS_ERROR(ENOSPC);
}
/*
* Done with the temporary mapping table.
*/
if (mapp != &map)
- kmem_free(mapp, sizeof(*mapp) * count);
+ kmem_free(mapp);
*dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno);
/*
* Update file's size if this is the data space and it grew.
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 6392f939029..1d9ef96f33a 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -74,7 +74,8 @@ extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_fsblock_t *first,
struct xfs_bmap_free *flist, xfs_extlen_t tot);
extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
- struct xfs_name *name, xfs_ino_t *inum);
+ struct xfs_name *name, xfs_ino_t *inum,
+ struct xfs_name *ci_name);
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t ino,
xfs_fsblock_t *first,
@@ -99,4 +100,7 @@ extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp,
extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
struct xfs_dabuf *bp);
+extern int xfs_dir_cilookup_result(struct xfs_da_args *args, const char *name,
+ int len);
+
#endif /* __XFS_DIR2_H__ */
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index fb5a556725b..e2fa0a1d8e9 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -215,7 +215,7 @@ xfs_dir2_block_addname(
/*
* If this isn't a real add, we're done with the buffer.
*/
- if (args->justcheck)
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
xfs_da_brelse(tp, bp);
/*
* If we don't have space for the new entry & leaf ...
@@ -225,7 +225,7 @@ xfs_dir2_block_addname(
* Not trying to actually do anything, or don't have
* a space reservation: return no-space.
*/
- if (args->justcheck || args->total == 0)
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
return XFS_ERROR(ENOSPC);
/*
* Convert to the next larger format.
@@ -240,7 +240,7 @@ xfs_dir2_block_addname(
/*
* Just checking, and it would work, so say so.
*/
- if (args->justcheck)
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
return 0;
needlog = needscan = 0;
/*
@@ -610,14 +610,15 @@ xfs_dir2_block_lookup(
/*
* Get the offset from the leaf entry, to point to the data.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)block + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
+ dep = (xfs_dir2_data_entry_t *)((char *)block +
+ xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
/*
- * Fill in inode number, release the block.
+ * Fill in inode number, CI name if appropriate, release the block.
*/
args->inumber = be64_to_cpu(dep->inumber);
+ error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
xfs_da_brelse(args->trans, bp);
- return XFS_ERROR(EEXIST);
+ return XFS_ERROR(error);
}
/*
@@ -643,6 +644,7 @@ xfs_dir2_block_lookup_int(
int mid; /* binary search current idx */
xfs_mount_t *mp; /* filesystem mount point */
xfs_trans_t *tp; /* transaction pointer */
+ enum xfs_dacmp cmp; /* comparison result */
dp = args->dp;
tp = args->trans;
@@ -673,7 +675,7 @@ xfs_dir2_block_lookup_int(
else
high = mid - 1;
if (low > high) {
- ASSERT(args->oknoent);
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
xfs_da_brelse(tp, bp);
return XFS_ERROR(ENOENT);
}
@@ -697,20 +699,31 @@ xfs_dir2_block_lookup_int(
dep = (xfs_dir2_data_entry_t *)
((char *)block + xfs_dir2_dataptr_to_off(mp, addr));
/*
- * Compare, if it's right give back buffer & entry number.
+ * Compare name and if it's an exact match, return the index
+ * and buffer. If it's the first case-insensitive match, store
+ * the index and buffer and continue looking for an exact match.
*/
- if (dep->namelen == args->namelen &&
- dep->name[0] == args->name[0] &&
- memcmp(dep->name, args->name, args->namelen) == 0) {
+ cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
*bpp = bp;
*entno = mid;
- return 0;
+ if (cmp == XFS_CMP_EXACT)
+ return 0;
}
- } while (++mid < be32_to_cpu(btp->count) && be32_to_cpu(blp[mid].hashval) == hash);
+ } while (++mid < be32_to_cpu(btp->count) &&
+ be32_to_cpu(blp[mid].hashval) == hash);
+
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ /*
+ * Here, we can only be doing a lookup (not a rename or replace).
+ * If a case-insensitive match was found earlier, return success.
+ */
+ if (args->cmpresult == XFS_CMP_CASE)
+ return 0;
/*
* No match, release the buffer and return ENOENT.
*/
- ASSERT(args->oknoent);
xfs_da_brelse(tp, bp);
return XFS_ERROR(ENOENT);
}
@@ -1033,6 +1046,7 @@ xfs_dir2_sf_to_block(
xfs_dir2_sf_t *sfp; /* shortform structure */
__be16 *tagp; /* end of data entry */
xfs_trans_t *tp; /* transaction pointer */
+ struct xfs_name name;
xfs_dir2_trace_args("sf_to_block", args);
dp = args->dp;
@@ -1071,7 +1085,7 @@ xfs_dir2_sf_to_block(
*/
error = xfs_dir2_grow_inode(args, XFS_DIR2_DATA_SPACE, &blkno);
if (error) {
- kmem_free(buf, buf_len);
+ kmem_free(buf);
return error;
}
/*
@@ -1079,7 +1093,7 @@ xfs_dir2_sf_to_block(
*/
error = xfs_dir2_data_init(args, blkno, &bp);
if (error) {
- kmem_free(buf, buf_len);
+ kmem_free(buf);
return error;
}
block = bp->data;
@@ -1187,8 +1201,10 @@ xfs_dir2_sf_to_block(
tagp = xfs_dir2_data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)block);
xfs_dir2_data_log_entry(tp, bp, dep);
- blp[2 + i].hashval = cpu_to_be32(xfs_da_hashname(
- (char *)sfep->name, sfep->namelen));
+ name.name = sfep->name;
+ name.len = sfep->namelen;
+ blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
+ hashname(&name));
blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
(char *)dep - (char *)block));
offset = (int)((char *)(tagp + 1) - (char *)block);
@@ -1198,7 +1214,7 @@ xfs_dir2_sf_to_block(
sfep = xfs_dir2_sf_nextentry(sfp, sfep);
}
/* Done with the temporary buffer */
- kmem_free(buf, buf_len);
+ kmem_free(buf);
/*
* Sort the leaf entries by hash value.
*/
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index fb8c9e08b23..498f8d69433 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -65,6 +65,7 @@ xfs_dir2_data_check(
xfs_mount_t *mp; /* filesystem mount point */
char *p; /* current data position */
int stale; /* count of stale leaves */
+ struct xfs_name name;
mp = dp->i_mount;
d = bp->data;
@@ -140,7 +141,9 @@ xfs_dir2_data_check(
addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
(xfs_dir2_data_aoff_t)
((char *)dep - (char *)d));
- hash = xfs_da_hashname((char *)dep->name, dep->namelen);
+ name.name = dep->name;
+ name.len = dep->namelen;
+ hash = mp->m_dirnameops->hashname(&name);
for (i = 0; i < be32_to_cpu(btp->count); i++) {
if (be32_to_cpu(lep[i].address) == addr &&
be32_to_cpu(lep[i].hashval) == hash)
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index bc52b803d79..93535992cb6 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -263,20 +263,21 @@ xfs_dir2_leaf_addname(
* If we don't have enough free bytes but we can make enough
* by compacting out stale entries, we'll do that.
*/
- if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] < needbytes &&
- be16_to_cpu(leaf->hdr.stale) > 1) {
+ if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <
+ needbytes && be16_to_cpu(leaf->hdr.stale) > 1) {
compact = 1;
}
/*
* Otherwise if we don't have enough free bytes we need to
* convert to node form.
*/
- else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(leaf->hdr.count)] <
- needbytes) {
+ else if ((char *)bestsp - (char *)&leaf->ents[be16_to_cpu(
+ leaf->hdr.count)] < needbytes) {
/*
* Just checking or no space reservation, give up.
*/
- if (args->justcheck || args->total == 0) {
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
+ args->total == 0) {
xfs_da_brelse(tp, lbp);
return XFS_ERROR(ENOSPC);
}
@@ -301,7 +302,7 @@ xfs_dir2_leaf_addname(
* If just checking, then it will fit unless we needed to allocate
* a new data block.
*/
- if (args->justcheck) {
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
xfs_da_brelse(tp, lbp);
return use_block == -1 ? XFS_ERROR(ENOSPC) : 0;
}
@@ -1110,7 +1111,7 @@ xfs_dir2_leaf_getdents(
*offset = XFS_DIR2_MAX_DATAPTR;
else
*offset = xfs_dir2_byte_to_dataptr(mp, curoff);
- kmem_free(map, map_size * sizeof(*map));
+ kmem_free(map);
if (bp)
xfs_da_brelse(NULL, bp);
return error;
@@ -1298,12 +1299,13 @@ xfs_dir2_leaf_lookup(
((char *)dbp->data +
xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address)));
/*
- * Return the found inode number.
+ * Return the found inode number & CI name if appropriate
*/
args->inumber = be64_to_cpu(dep->inumber);
+ error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
xfs_da_brelse(tp, dbp);
xfs_da_brelse(tp, lbp);
- return XFS_ERROR(EEXIST);
+ return XFS_ERROR(error);
}
/*
@@ -1319,8 +1321,8 @@ xfs_dir2_leaf_lookup_int(
int *indexp, /* out: index in leaf block */
xfs_dabuf_t **dbpp) /* out: data buffer */
{
- xfs_dir2_db_t curdb; /* current data block number */
- xfs_dabuf_t *dbp; /* data buffer */
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ xfs_dabuf_t *dbp = NULL; /* data buffer */
xfs_dir2_data_entry_t *dep; /* data entry */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return code */
@@ -1331,6 +1333,8 @@ xfs_dir2_leaf_lookup_int(
xfs_mount_t *mp; /* filesystem mount point */
xfs_dir2_db_t newdb; /* new data block number */
xfs_trans_t *tp; /* transaction pointer */
+ xfs_dir2_db_t cidb = -1; /* case match data block no. */
+ enum xfs_dacmp cmp; /* name compare result */
dp = args->dp;
tp = args->trans;
@@ -1338,11 +1342,10 @@ xfs_dir2_leaf_lookup_int(
/*
* Read the leaf block into the buffer.
*/
- if ((error =
- xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
- XFS_DATA_FORK))) {
+ error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
+ XFS_DATA_FORK);
+ if (error)
return error;
- }
*lbpp = lbp;
leaf = lbp->data;
xfs_dir2_leaf_check(dp, lbp);
@@ -1354,9 +1357,9 @@ xfs_dir2_leaf_lookup_int(
* Loop over all the entries with the right hash value
* looking to match the name.
*/
- for (lep = &leaf->ents[index], dbp = NULL, curdb = -1;
- index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval;
- lep++, index++) {
+ for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
+ be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
/*
* Skip over stale leaf entries.
*/
@@ -1373,10 +1376,10 @@ xfs_dir2_leaf_lookup_int(
if (newdb != curdb) {
if (dbp)
xfs_da_brelse(tp, dbp);
- if ((error =
- xfs_da_read_buf(tp, dp,
- xfs_dir2_db_to_da(mp, newdb), -1, &dbp,
- XFS_DATA_FORK))) {
+ error = xfs_da_read_buf(tp, dp,
+ xfs_dir2_db_to_da(mp, newdb),
+ -1, &dbp, XFS_DATA_FORK);
+ if (error) {
xfs_da_brelse(tp, lbp);
return error;
}
@@ -1386,24 +1389,50 @@ xfs_dir2_leaf_lookup_int(
/*
* Point to the data entry.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)dbp->data +
- xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ dep = (xfs_dir2_data_entry_t *)((char *)dbp->data +
+ xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
/*
- * If it matches then return it.
+ * Compare name and if it's an exact match, return the index
+ * and buffer. If it's the first case-insensitive match, store
+ * the index and buffer and continue looking for an exact match.
*/
- if (dep->namelen == args->namelen &&
- dep->name[0] == args->name[0] &&
- memcmp(dep->name, args->name, args->namelen) == 0) {
- *dbpp = dbp;
+ cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
*indexp = index;
- return 0;
+ /* case exact match: return the current buffer. */
+ if (cmp == XFS_CMP_EXACT) {
+ *dbpp = dbp;
+ return 0;
+ }
+ cidb = curdb;
}
}
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ /*
+ * Here, we can only be doing a lookup (not a rename or remove).
+ * If a case-insensitive match was found earlier, re-read the
+ * appropriate data block if required and return it.
+ */
+ if (args->cmpresult == XFS_CMP_CASE) {
+ ASSERT(cidb != -1);
+ if (cidb != curdb) {
+ xfs_da_brelse(tp, dbp);
+ error = xfs_da_read_buf(tp, dp,
+ xfs_dir2_db_to_da(mp, cidb),
+ -1, &dbp, XFS_DATA_FORK);
+ if (error) {
+ xfs_da_brelse(tp, lbp);
+ return error;
+ }
+ }
+ *dbpp = dbp;
+ return 0;
+ }
/*
* No match found, return ENOENT.
*/
- ASSERT(args->oknoent);
+ ASSERT(cidb == -1);
if (dbp)
xfs_da_brelse(tp, dbp);
xfs_da_brelse(tp, lbp);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 8dade711f09..fa6c3a5ddbc 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -226,7 +226,7 @@ xfs_dir2_leafn_add(
ASSERT(index == be16_to_cpu(leaf->hdr.count) ||
be32_to_cpu(leaf->ents[index].hashval) >= args->hashval);
- if (args->justcheck)
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
return 0;
/*
@@ -387,28 +387,26 @@ xfs_dir2_leafn_lasthash(
}
/*
- * Look up a leaf entry in a node-format leaf block.
- * If this is an addname then the extrablk in state is a freespace block,
- * otherwise it's a data block.
+ * Look up a leaf entry for space to add a name in a node-format leaf block.
+ * The extrablk in state is a freespace block.
*/
-int
-xfs_dir2_leafn_lookup_int(
+STATIC int
+xfs_dir2_leafn_lookup_for_addname(
xfs_dabuf_t *bp, /* leaf buffer */
xfs_da_args_t *args, /* operation arguments */
int *indexp, /* out: leaf entry index */
xfs_da_state_t *state) /* state to fill in */
{
- xfs_dabuf_t *curbp; /* current data/free buffer */
- xfs_dir2_db_t curdb; /* current data block number */
- xfs_dir2_db_t curfdb; /* current free block number */
- xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_dabuf_t *curbp = NULL; /* current data/free buffer */
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ xfs_dir2_db_t curfdb = -1; /* current free block number */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return value */
int fi; /* free entry index */
- xfs_dir2_free_t *free=NULL; /* free block structure */
+ xfs_dir2_free_t *free = NULL; /* free block structure */
int index; /* leaf entry index */
xfs_dir2_leaf_t *leaf; /* leaf structure */
- int length=0; /* length of new data entry */
+ int length; /* length of new data entry */
xfs_dir2_leaf_entry_t *lep; /* leaf entry */
xfs_mount_t *mp; /* filesystem mount point */
xfs_dir2_db_t newdb; /* new data block number */
@@ -431,33 +429,20 @@ xfs_dir2_leafn_lookup_int(
/*
* Do we have a buffer coming in?
*/
- if (state->extravalid)
+ if (state->extravalid) {
+ /* If so, it's a free block buffer, get the block number. */
curbp = state->extrablk.bp;
- else
- curbp = NULL;
- /*
- * For addname, it's a free block buffer, get the block number.
- */
- if (args->addname) {
- curfdb = curbp ? state->extrablk.blkno : -1;
- curdb = -1;
- length = xfs_dir2_data_entsize(args->namelen);
- if ((free = (curbp ? curbp->data : NULL)))
- ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
- }
- /*
- * For others, it's a data block buffer, get the block number.
- */
- else {
- curfdb = -1;
- curdb = curbp ? state->extrablk.blkno : -1;
+ curfdb = state->extrablk.blkno;
+ free = curbp->data;
+ ASSERT(be32_to_cpu(free->hdr.magic) == XFS_DIR2_FREE_MAGIC);
}
+ length = xfs_dir2_data_entsize(args->namelen);
/*
* Loop over leaf entries with the right hash value.
*/
- for (lep = &leaf->ents[index];
- index < be16_to_cpu(leaf->hdr.count) && be32_to_cpu(lep->hashval) == args->hashval;
- lep++, index++) {
+ for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
+ be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
/*
* Skip stale leaf entries.
*/
@@ -471,161 +456,244 @@ xfs_dir2_leafn_lookup_int(
* For addname, we're looking for a place to put the new entry.
* We want to use a data block with an entry of equal
* hash value to ours if there is one with room.
+ *
+ * If this block isn't the data block we already have
+ * in hand, take a look at it.
*/
- if (args->addname) {
+ if (newdb != curdb) {
+ curdb = newdb;
/*
- * If this block isn't the data block we already have
- * in hand, take a look at it.
+ * Convert the data block to the free block
+ * holding its freespace information.
*/
- if (newdb != curdb) {
- curdb = newdb;
- /*
- * Convert the data block to the free block
- * holding its freespace information.
- */
- newfdb = xfs_dir2_db_to_fdb(mp, newdb);
- /*
- * If it's not the one we have in hand,
- * read it in.
- */
- if (newfdb != curfdb) {
- /*
- * If we had one before, drop it.
- */
- if (curbp)
- xfs_da_brelse(tp, curbp);
- /*
- * Read the free block.
- */
- if ((error = xfs_da_read_buf(tp, dp,
- xfs_dir2_db_to_da(mp,
- newfdb),
- -1, &curbp,
- XFS_DATA_FORK))) {
- return error;
- }
- free = curbp->data;
- ASSERT(be32_to_cpu(free->hdr.magic) ==
- XFS_DIR2_FREE_MAGIC);
- ASSERT((be32_to_cpu(free->hdr.firstdb) %
- XFS_DIR2_MAX_FREE_BESTS(mp)) ==
- 0);
- ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb);
- ASSERT(curdb <
- be32_to_cpu(free->hdr.firstdb) +
- be32_to_cpu(free->hdr.nvalid));
- }
- /*
- * Get the index for our entry.
- */
- fi = xfs_dir2_db_to_fdindex(mp, curdb);
- /*
- * If it has room, return it.
- */
- if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) {
- XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
- XFS_ERRLEVEL_LOW, mp);
- if (curfdb != newfdb)
- xfs_da_brelse(tp, curbp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- curfdb = newfdb;
- if (be16_to_cpu(free->bests[fi]) >= length) {
- *indexp = index;
- state->extravalid = 1;
- state->extrablk.bp = curbp;
- state->extrablk.blkno = curfdb;
- state->extrablk.index = fi;
- state->extrablk.magic =
- XFS_DIR2_FREE_MAGIC;
- ASSERT(args->oknoent);
- return XFS_ERROR(ENOENT);
- }
- }
- }
- /*
- * Not adding a new entry, so we really want to find
- * the name given to us.
- */
- else {
+ newfdb = xfs_dir2_db_to_fdb(mp, newdb);
/*
- * If it's a different data block, go get it.
+ * If it's not the one we have in hand, read it in.
*/
- if (newdb != curdb) {
+ if (newfdb != curfdb) {
/*
- * If we had a block before, drop it.
+ * If we had one before, drop it.
*/
if (curbp)
xfs_da_brelse(tp, curbp);
/*
- * Read the data block.
+ * Read the free block.
*/
- if ((error =
- xfs_da_read_buf(tp, dp,
- xfs_dir2_db_to_da(mp, newdb), -1,
- &curbp, XFS_DATA_FORK))) {
+ error = xfs_da_read_buf(tp, dp,
+ xfs_dir2_db_to_da(mp, newfdb),
+ -1, &curbp, XFS_DATA_FORK);
+ if (error)
return error;
- }
- xfs_dir2_data_check(dp, curbp);
- curdb = newdb;
+ free = curbp->data;
+ ASSERT(be32_to_cpu(free->hdr.magic) ==
+ XFS_DIR2_FREE_MAGIC);
+ ASSERT((be32_to_cpu(free->hdr.firstdb) %
+ XFS_DIR2_MAX_FREE_BESTS(mp)) == 0);
+ ASSERT(be32_to_cpu(free->hdr.firstdb) <= curdb);
+ ASSERT(curdb < be32_to_cpu(free->hdr.firstdb) +
+ be32_to_cpu(free->hdr.nvalid));
}
/*
- * Point to the data entry.
+ * Get the index for our entry.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)curbp->data +
- xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ fi = xfs_dir2_db_to_fdindex(mp, curdb);
/*
- * Compare the entry, return it if it matches.
+ * If it has room, return it.
*/
- if (dep->namelen == args->namelen &&
- dep->name[0] == args->name[0] &&
- memcmp(dep->name, args->name, args->namelen) == 0) {
- args->inumber = be64_to_cpu(dep->inumber);
- *indexp = index;
- state->extravalid = 1;
- state->extrablk.bp = curbp;
- state->extrablk.blkno = curdb;
- state->extrablk.index =
- (int)((char *)dep -
- (char *)curbp->data);
- state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
- return XFS_ERROR(EEXIST);
+ if (unlikely(be16_to_cpu(free->bests[fi]) == NULLDATAOFF)) {
+ XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
+ XFS_ERRLEVEL_LOW, mp);
+ if (curfdb != newfdb)
+ xfs_da_brelse(tp, curbp);
+ return XFS_ERROR(EFSCORRUPTED);
}
+ curfdb = newfdb;
+ if (be16_to_cpu(free->bests[fi]) >= length)
+ goto out;
}
}
+ /* Didn't find any space */
+ fi = -1;
+out:
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
+ if (curbp) {
+ /* Giving back a free block. */
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
+ state->extrablk.index = fi;
+ state->extrablk.blkno = curfdb;
+ state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+ } else {
+ state->extravalid = 0;
+ }
/*
- * Didn't find a match.
- * If we are holding a buffer, give it back in case our caller
- * finds it useful.
+ * Return the index, that will be the insertion point.
*/
- if ((state->extravalid = (curbp != NULL))) {
- state->extrablk.bp = curbp;
- state->extrablk.index = -1;
+ *indexp = index;
+ return XFS_ERROR(ENOENT);
+}
+
+/*
+ * Look up a leaf entry in a node-format leaf block.
+ * The extrablk in state a data block.
+ */
+STATIC int
+xfs_dir2_leafn_lookup_for_entry(
+ xfs_dabuf_t *bp, /* leaf buffer */
+ xfs_da_args_t *args, /* operation arguments */
+ int *indexp, /* out: leaf entry index */
+ xfs_da_state_t *state) /* state to fill in */
+{
+ xfs_dabuf_t *curbp = NULL; /* current data/free buffer */
+ xfs_dir2_db_t curdb = -1; /* current data block number */
+ xfs_dir2_data_entry_t *dep; /* data block entry */
+ xfs_inode_t *dp; /* incore directory inode */
+ int error; /* error return value */
+ int index; /* leaf entry index */
+ xfs_dir2_leaf_t *leaf; /* leaf structure */
+ xfs_dir2_leaf_entry_t *lep; /* leaf entry */
+ xfs_mount_t *mp; /* filesystem mount point */
+ xfs_dir2_db_t newdb; /* new data block number */
+ xfs_trans_t *tp; /* transaction pointer */
+ enum xfs_dacmp cmp; /* comparison result */
+
+ dp = args->dp;
+ tp = args->trans;
+ mp = dp->i_mount;
+ leaf = bp->data;
+ ASSERT(be16_to_cpu(leaf->hdr.info.magic) == XFS_DIR2_LEAFN_MAGIC);
+#ifdef __KERNEL__
+ ASSERT(be16_to_cpu(leaf->hdr.count) > 0);
+#endif
+ xfs_dir2_leafn_check(dp, bp);
+ /*
+ * Look up the hash value in the leaf entries.
+ */
+ index = xfs_dir2_leaf_search_hash(args, bp);
+ /*
+ * Do we have a buffer coming in?
+ */
+ if (state->extravalid) {
+ curbp = state->extrablk.bp;
+ curdb = state->extrablk.blkno;
+ }
+ /*
+ * Loop over leaf entries with the right hash value.
+ */
+ for (lep = &leaf->ents[index]; index < be16_to_cpu(leaf->hdr.count) &&
+ be32_to_cpu(lep->hashval) == args->hashval;
+ lep++, index++) {
/*
- * For addname, giving back a free block.
+ * Skip stale leaf entries.
*/
- if (args->addname) {
- state->extrablk.blkno = curfdb;
- state->extrablk.magic = XFS_DIR2_FREE_MAGIC;
+ if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
+ continue;
+ /*
+ * Pull the data block number from the entry.
+ */
+ newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ /*
+ * Not adding a new entry, so we really want to find
+ * the name given to us.
+ *
+ * If it's a different data block, go get it.
+ */
+ if (newdb != curdb) {
+ /*
+ * If we had a block before that we aren't saving
+ * for a CI name, drop it
+ */
+ if (curbp && (args->cmpresult == XFS_CMP_DIFFERENT ||
+ curdb != state->extrablk.blkno))
+ xfs_da_brelse(tp, curbp);
+ /*
+ * If needing the block that is saved with a CI match,
+ * use it otherwise read in the new data block.
+ */
+ if (args->cmpresult != XFS_CMP_DIFFERENT &&
+ newdb == state->extrablk.blkno) {
+ ASSERT(state->extravalid);
+ curbp = state->extrablk.bp;
+ } else {
+ error = xfs_da_read_buf(tp, dp,
+ xfs_dir2_db_to_da(mp, newdb),
+ -1, &curbp, XFS_DATA_FORK);
+ if (error)
+ return error;
+ }
+ xfs_dir2_data_check(dp, curbp);
+ curdb = newdb;
}
/*
- * For other callers, giving back a data block.
+ * Point to the data entry.
*/
- else {
+ dep = (xfs_dir2_data_entry_t *)((char *)curbp->data +
+ xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ /*
+ * Compare the entry and if it's an exact match, return
+ * EEXIST immediately. If it's the first case-insensitive
+ * match, store the block & inode number and continue looking.
+ */
+ cmp = mp->m_dirnameops->compname(args, dep->name, dep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ /* If there is a CI match block, drop it */
+ if (args->cmpresult != XFS_CMP_DIFFERENT &&
+ curdb != state->extrablk.blkno)
+ xfs_da_brelse(tp, state->extrablk.bp);
+ args->cmpresult = cmp;
+ args->inumber = be64_to_cpu(dep->inumber);
+ *indexp = index;
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
state->extrablk.blkno = curdb;
+ state->extrablk.index = (int)((char *)dep -
+ (char *)curbp->data);
state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+ if (cmp == XFS_CMP_EXACT)
+ return XFS_ERROR(EEXIST);
}
}
- /*
- * Return the final index, that will be the insertion point.
- */
+ ASSERT(index == be16_to_cpu(leaf->hdr.count) ||
+ (args->op_flags & XFS_DA_OP_OKNOENT));
+ if (curbp) {
+ if (args->cmpresult == XFS_CMP_DIFFERENT) {
+ /* Giving back last used data block. */
+ state->extravalid = 1;
+ state->extrablk.bp = curbp;
+ state->extrablk.index = -1;
+ state->extrablk.blkno = curdb;
+ state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+ } else {
+ /* If the curbp is not the CI match block, drop it */
+ if (state->extrablk.bp != curbp)
+ xfs_da_brelse(tp, curbp);
+ }
+ } else {
+ state->extravalid = 0;
+ }
*indexp = index;
- ASSERT(index == be16_to_cpu(leaf->hdr.count) || args->oknoent);
return XFS_ERROR(ENOENT);
}
/*
+ * Look up a leaf entry in a node-format leaf block.
+ * If this is an addname then the extrablk in state is a freespace block,
+ * otherwise it's a data block.
+ */
+int
+xfs_dir2_leafn_lookup_int(
+ xfs_dabuf_t *bp, /* leaf buffer */
+ xfs_da_args_t *args, /* operation arguments */
+ int *indexp, /* out: leaf entry index */
+ xfs_da_state_t *state) /* state to fill in */
+{
+ if (args->op_flags & XFS_DA_OP_ADDNAME)
+ return xfs_dir2_leafn_lookup_for_addname(bp, args, indexp,
+ state);
+ return xfs_dir2_leafn_lookup_for_entry(bp, args, indexp, state);
+}
+
+/*
* Move count leaf entries from source to destination leaf.
* Log entries and headers. Stale entries are preserved.
*/
@@ -823,9 +891,10 @@ xfs_dir2_leafn_rebalance(
*/
if (!state->inleaf)
blk2->index = blk1->index - be16_to_cpu(leaf1->hdr.count);
-
- /*
- * Finally sanity check just to make sure we are not returning a negative index
+
+ /*
+ * Finally sanity check just to make sure we are not returning a
+ * negative index
*/
if(blk2->index < 0) {
state->inleaf = 1;
@@ -1332,7 +1401,7 @@ xfs_dir2_node_addname(
/*
* It worked, fix the hash values up the btree.
*/
- if (!args->justcheck)
+ if (!(args->op_flags & XFS_DA_OP_JUSTCHECK))
xfs_da_fixhashpath(state, &state->path);
} else {
/*
@@ -1515,7 +1584,8 @@ xfs_dir2_node_addname_int(
/*
* Not allowed to allocate, return failure.
*/
- if (args->justcheck || args->total == 0) {
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
+ args->total == 0) {
/*
* Drop the freespace buffer unless it came from our
* caller.
@@ -1661,7 +1731,7 @@ xfs_dir2_node_addname_int(
/*
* If just checking, we succeeded.
*/
- if (args->justcheck) {
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
if ((fblk == NULL || fblk->bp == NULL) && fbp != NULL)
xfs_da_buf_done(fbp);
return 0;
@@ -1767,6 +1837,14 @@ xfs_dir2_node_lookup(
error = xfs_da_node_lookup_int(state, &rval);
if (error)
rval = error;
+ else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) {
+ /* If a CI match, dup the actual name and return EEXIST */
+ xfs_dir2_data_entry_t *dep;
+
+ dep = (xfs_dir2_data_entry_t *)((char *)state->extrablk.bp->
+ data + state->extrablk.index);
+ rval = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
+ }
/*
* Release the btree blocks and leaf block.
*/
@@ -1810,9 +1888,8 @@ xfs_dir2_node_removename(
* Look up the entry we're deleting, set up the cursor.
*/
error = xfs_da_node_lookup_int(state, &rval);
- if (error) {
+ if (error)
rval = error;
- }
/*
* Didn't find it, upper layer screwed up.
*/
@@ -1829,9 +1906,8 @@ xfs_dir2_node_removename(
*/
error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
&state->extrablk, &rval);
- if (error) {
+ if (error)
return error;
- }
/*
* Fix the hash values up the btree.
*/
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 919d275a1ce..b46af0013ec 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -255,7 +255,7 @@ xfs_dir2_block_to_sf(
xfs_dir2_sf_check(args);
out:
xfs_trans_log_inode(args->trans, dp, logflags);
- kmem_free(block, mp->m_dirblksize);
+ kmem_free(block);
return error;
}
@@ -332,7 +332,7 @@ xfs_dir2_sf_addname(
/*
* Just checking or no space reservation, it doesn't fit.
*/
- if (args->justcheck || args->total == 0)
+ if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
return XFS_ERROR(ENOSPC);
/*
* Convert to block form then add the name.
@@ -345,7 +345,7 @@ xfs_dir2_sf_addname(
/*
* Just checking, it fits.
*/
- if (args->justcheck)
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK)
return 0;
/*
* Do it the easy way - just add it at the end.
@@ -512,7 +512,7 @@ xfs_dir2_sf_addname_hard(
sfep = xfs_dir2_sf_nextentry(sfp, sfep);
memcpy(sfep, oldsfep, old_isize - nbytes);
}
- kmem_free(buf, old_isize);
+ kmem_free(buf);
dp->i_d.di_size = new_isize;
xfs_dir2_sf_check(args);
}
@@ -812,8 +812,11 @@ xfs_dir2_sf_lookup(
{
xfs_inode_t *dp; /* incore directory inode */
int i; /* entry index */
+ int error;
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
xfs_dir2_sf_t *sfp; /* shortform structure */
+ enum xfs_dacmp cmp; /* comparison result */
+ xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */
xfs_dir2_trace_args("sf_lookup", args);
xfs_dir2_sf_check(args);
@@ -836,6 +839,7 @@ xfs_dir2_sf_lookup(
*/
if (args->namelen == 1 && args->name[0] == '.') {
args->inumber = dp->i_ino;
+ args->cmpresult = XFS_CMP_EXACT;
return XFS_ERROR(EEXIST);
}
/*
@@ -844,28 +848,41 @@ xfs_dir2_sf_lookup(
if (args->namelen == 2 &&
args->name[0] == '.' && args->name[1] == '.') {
args->inumber = xfs_dir2_sf_get_inumber(sfp, &sfp->hdr.parent);
+ args->cmpresult = XFS_CMP_EXACT;
return XFS_ERROR(EEXIST);
}
/*
* Loop over all the entries trying to match ours.
*/
- for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
- i < sfp->hdr.count;
- i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
- if (sfep->namelen == args->namelen &&
- sfep->name[0] == args->name[0] &&
- memcmp(args->name, sfep->name, args->namelen) == 0) {
- args->inumber =
- xfs_dir2_sf_get_inumber(sfp,
- xfs_dir2_sf_inumberp(sfep));
- return XFS_ERROR(EEXIST);
+ ci_sfep = NULL;
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count;
+ i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+ /*
+ * Compare name and if it's an exact match, return the inode
+ * number. If it's the first case-insensitive match, store the
+ * inode number and continue looking for an exact match.
+ */
+ cmp = dp->i_mount->m_dirnameops->compname(args, sfep->name,
+ sfep->namelen);
+ if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
+ args->cmpresult = cmp;
+ args->inumber = xfs_dir2_sf_get_inumber(sfp,
+ xfs_dir2_sf_inumberp(sfep));
+ if (cmp == XFS_CMP_EXACT)
+ return XFS_ERROR(EEXIST);
+ ci_sfep = sfep;
}
}
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
/*
- * Didn't find it.
+ * Here, we can only be doing a lookup (not a rename or replace).
+ * If a case-insensitive match was not found, return ENOENT.
*/
- ASSERT(args->oknoent);
- return XFS_ERROR(ENOENT);
+ if (!ci_sfep)
+ return XFS_ERROR(ENOENT);
+ /* otherwise process the CI match as required by the caller */
+ error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
+ return XFS_ERROR(error);
}
/*
@@ -904,24 +921,21 @@ xfs_dir2_sf_removename(
* Loop over the old directory entries.
* Find the one we're deleting.
*/
- for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
- i < sfp->hdr.count;
- i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
- if (sfep->namelen == args->namelen &&
- sfep->name[0] == args->name[0] &&
- memcmp(sfep->name, args->name, args->namelen) == 0) {
+ for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->hdr.count;
+ i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+ if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+ XFS_CMP_EXACT) {
ASSERT(xfs_dir2_sf_get_inumber(sfp,
- xfs_dir2_sf_inumberp(sfep)) ==
- args->inumber);
+ xfs_dir2_sf_inumberp(sfep)) ==
+ args->inumber);
break;
}
}
/*
* Didn't find it.
*/
- if (i == sfp->hdr.count) {
+ if (i == sfp->hdr.count)
return XFS_ERROR(ENOENT);
- }
/*
* Calculate sizes.
*/
@@ -1042,11 +1056,10 @@ xfs_dir2_sf_replace(
*/
else {
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
- i < sfp->hdr.count;
- i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
- if (sfep->namelen == args->namelen &&
- sfep->name[0] == args->name[0] &&
- memcmp(args->name, sfep->name, args->namelen) == 0) {
+ i < sfp->hdr.count;
+ i++, sfep = xfs_dir2_sf_nextentry(sfp, sfep)) {
+ if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
+ XFS_CMP_EXACT) {
#if XFS_BIG_INUMS || defined(DEBUG)
ino = xfs_dir2_sf_get_inumber(sfp,
xfs_dir2_sf_inumberp(sfep));
@@ -1061,7 +1074,7 @@ xfs_dir2_sf_replace(
* Didn't find it.
*/
if (i == sfp->hdr.count) {
- ASSERT(args->oknoent);
+ ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
#if XFS_BIG_INUMS
if (i8elevated)
xfs_dir2_sf_toino4(args);
@@ -1174,7 +1187,7 @@ xfs_dir2_sf_toino4(
/*
* Clean up the inode.
*/
- kmem_free(buf, oldsize);
+ kmem_free(buf);
dp->i_d.di_size = newsize;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
}
@@ -1251,7 +1264,7 @@ xfs_dir2_sf_toino8(
/*
* Clean up the inode.
*/
- kmem_free(buf, oldsize);
+ kmem_free(buf);
dp->i_d.di_size = newsize;
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
}
diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h
index 005629d702d..deecc9d238f 100644
--- a/fs/xfs/xfs_dir2_sf.h
+++ b/fs/xfs/xfs_dir2_sf.h
@@ -62,7 +62,7 @@ typedef union {
* Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
* Only need 16 bits, this is the byte offset into the single block form.
*/
-typedef struct { __uint8_t i[2]; } xfs_dir2_sf_off_t;
+typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
/*
* The parent directory has a dedicated field, and the self-pointer must
@@ -76,14 +76,14 @@ typedef struct xfs_dir2_sf_hdr {
__uint8_t count; /* count of entries */
__uint8_t i8count; /* count of 8-byte inode #s */
xfs_dir2_inou_t parent; /* parent dir inode number */
-} xfs_dir2_sf_hdr_t;
+} __arch_pack xfs_dir2_sf_hdr_t;
typedef struct xfs_dir2_sf_entry {
__uint8_t namelen; /* actual name length */
xfs_dir2_sf_off_t offset; /* saved offset */
__uint8_t name[1]; /* name, variable size */
xfs_dir2_inou_t inumber; /* inode number, var. offset */
-} xfs_dir2_sf_entry_t;
+} __arch_pack xfs_dir2_sf_entry_t;
typedef struct xfs_dir2_sf {
xfs_dir2_sf_hdr_t hdr; /* shortform header */
diff --git a/fs/xfs/xfs_dir2_trace.c b/fs/xfs/xfs_dir2_trace.c
index f3fb2ffd6f5..6cc7c0c681a 100644
--- a/fs/xfs/xfs_dir2_trace.c
+++ b/fs/xfs/xfs_dir2_trace.c
@@ -85,7 +85,8 @@ xfs_dir2_trace_args(
(void *)((unsigned long)(args->inumber >> 32)),
(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
(void *)args->dp, (void *)args->trans,
- (void *)(unsigned long)args->justcheck, NULL, NULL);
+ (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
+ NULL, NULL);
}
void
@@ -100,7 +101,7 @@ xfs_dir2_trace_args_b(
(void *)((unsigned long)(args->inumber >> 32)),
(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
(void *)args->dp, (void *)args->trans,
- (void *)(unsigned long)args->justcheck,
+ (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
(void *)(bp ? bp->bps[0] : NULL), NULL);
}
@@ -117,7 +118,7 @@ xfs_dir2_trace_args_bb(
(void *)((unsigned long)(args->inumber >> 32)),
(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
(void *)args->dp, (void *)args->trans,
- (void *)(unsigned long)args->justcheck,
+ (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
(void *)(lbp ? lbp->bps[0] : NULL),
(void *)(dbp ? dbp->bps[0] : NULL));
}
@@ -157,8 +158,8 @@ xfs_dir2_trace_args_db(
(void *)((unsigned long)(args->inumber >> 32)),
(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
(void *)args->dp, (void *)args->trans,
- (void *)(unsigned long)args->justcheck, (void *)(long)db,
- (void *)dbp);
+ (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
+ (void *)(long)db, (void *)dbp);
}
void
@@ -173,7 +174,7 @@ xfs_dir2_trace_args_i(
(void *)((unsigned long)(args->inumber >> 32)),
(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
(void *)args->dp, (void *)args->trans,
- (void *)(unsigned long)args->justcheck,
+ (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
(void *)((unsigned long)(i >> 32)),
(void *)((unsigned long)(i & 0xFFFFFFFF)));
}
@@ -190,7 +191,8 @@ xfs_dir2_trace_args_s(
(void *)((unsigned long)(args->inumber >> 32)),
(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
(void *)args->dp, (void *)args->trans,
- (void *)(unsigned long)args->justcheck, (void *)(long)s, NULL);
+ (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
+ (void *)(long)s, NULL);
}
void
@@ -208,7 +210,7 @@ xfs_dir2_trace_args_sb(
(void *)((unsigned long)(args->inumber >> 32)),
(void *)((unsigned long)(args->inumber & 0xFFFFFFFF)),
(void *)args->dp, (void *)args->trans,
- (void *)(unsigned long)args->justcheck, (void *)(long)s,
- (void *)dbp);
+ (void *)(unsigned long)(args->op_flags & XFS_DA_OP_JUSTCHECK),
+ (void *)(long)s, (void *)dbp);
}
#endif /* XFS_DIR2_TRACE */
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
index f71784ab6a6..2813cdd7237 100644
--- a/fs/xfs/xfs_dmapi.h
+++ b/fs/xfs/xfs_dmapi.h
@@ -18,7 +18,6 @@
#ifndef __XFS_DMAPI_H__
#define __XFS_DMAPI_H__
-#include <linux/version.h>
/* Values used to define the on-disk version of dm_attrname_t. All
* on-disk attribute names start with the 8-byte string "SGI_DMI_".
*
@@ -166,6 +165,6 @@ typedef enum {
#define FILP_DELAY_FLAG(filp) ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) ? \
DM_FLAGS_NDELAY : 0)
-#define AT_DELAY_FLAG(f) ((f&ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
+#define AT_DELAY_FLAG(f) ((f & XFS_ATTR_NONBLOCK) ? DM_FLAGS_NDELAY : 0)
#endif /* __XFS_DMAPI_H__ */
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 05e5365d3c3..f227ecd1a29 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -58,22 +58,11 @@ xfs_error_trap(int e)
}
return e;
}
-#endif
-
-#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
int xfs_etest[XFS_NUM_INJECT_ERROR];
int64_t xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
char * xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
-void
-xfs_error_test_init(void)
-{
- memset(xfs_etest, 0, sizeof(xfs_etest));
- memset(xfs_etest_fsid, 0, sizeof(xfs_etest_fsid));
- memset(xfs_etest_fsname, 0, sizeof(xfs_etest_fsname));
-}
-
int
xfs_error_test(int error_tag, int *fsidp, char *expression,
int line, char *file, unsigned long randfactor)
@@ -150,8 +139,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
xfs_etest[i]);
xfs_etest[i] = 0;
xfs_etest_fsid[i] = 0LL;
- kmem_free(xfs_etest_fsname[i],
- strlen(xfs_etest_fsname[i]) + 1);
+ kmem_free(xfs_etest_fsname[i]);
xfs_etest_fsname[i] = NULL;
}
}
@@ -163,7 +151,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
return 0;
}
-#endif /* DEBUG || INDUCE_IO_ERROR */
+#endif /* DEBUG */
static void
xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
@@ -175,7 +163,7 @@ xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
newfmt = kmem_alloc(len, KM_SLEEP);
sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt);
icmn_err(level, newfmt, ap);
- kmem_free(newfmt, len);
+ kmem_free(newfmt);
} else {
icmn_err(level, fmt, ap);
}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 6490d2a9f8e..11543f10b0c 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -125,23 +125,14 @@ extern void xfs_corruption_error(char *tag, int level, struct xfs_mount *mp,
#define XFS_RANDOM_DIOWRITE_IOERR (XFS_RANDOM_DEFAULT/10)
#define XFS_RANDOM_BMAPIFORMAT XFS_RANDOM_DEFAULT
-#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
+#ifdef DEBUG
extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
-extern void xfs_error_test_init(void);
#define XFS_NUM_INJECT_ERROR 10
-
-#ifdef __ANSI_CPP__
-#define XFS_TEST_ERROR(expr, mp, tag, rf) \
- ((expr) || \
- xfs_error_test((tag), (mp)->m_fixedfsid, #expr, __LINE__, __FILE__, \
- (rf)))
-#else
#define XFS_TEST_ERROR(expr, mp, tag, rf) \
((expr) || \
xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
(rf)))
-#endif /* __ANSI_CPP__ */
extern int xfs_errortag_add(int error_tag, xfs_mount_t *mp);
extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
@@ -149,7 +140,7 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
#define xfs_errortag_add(tag, mp) (ENOSYS)
#define xfs_errortag_clearall(mp, loud) (ENOSYS)
-#endif /* (DEBUG || INDUCE_IO_ERROR) */
+#endif /* DEBUG */
/*
* XFS panic tags -- allow a call to xfs_cmn_err() be turned into
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 132bd07b9bb..8aa28f751b2 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -41,8 +41,7 @@ xfs_efi_item_free(xfs_efi_log_item_t *efip)
int nexts = efip->efi_format.efi_nextents;
if (nexts > XFS_EFI_MAX_FAST_EXTENTS) {
- kmem_free(efip, sizeof(xfs_efi_log_item_t) +
- (nexts - 1) * sizeof(xfs_extent_t));
+ kmem_free(efip);
} else {
kmem_zone_free(xfs_efi_zone, efip);
}
@@ -374,8 +373,7 @@ xfs_efd_item_free(xfs_efd_log_item_t *efdp)
int nexts = efdp->efd_format.efd_nextents;
if (nexts > XFS_EFD_MAX_FAST_EXTENTS) {
- kmem_free(efdp, sizeof(xfs_efd_log_item_t) +
- (nexts - 1) * sizeof(xfs_extent_t));
+ kmem_free(efdp);
} else {
kmem_zone_free(xfs_efd_zone, efdp);
}
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 3f3785b1080..f3bb75da384 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -397,10 +397,12 @@ int
xfs_filestream_init(void)
{
item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
+ if (!item_zone)
+ return -ENOMEM;
#ifdef XFS_FILESTREAMS_TRACE
- xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_SLEEP);
+ xfs_filestreams_trace_buf = ktrace_alloc(XFS_FSTRM_KTRACE_SIZE, KM_NOFS);
#endif
- return item_zone ? 0 : -ENOMEM;
+ return 0;
}
/*
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 3bed6433d05..01c0cc88d3f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */
#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */
#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
@@ -371,6 +372,9 @@ typedef struct xfs_fsop_attrlist_handlereq {
typedef struct xfs_attr_multiop {
__u32 am_opcode;
+#define ATTR_OP_GET 1 /* return the indicated attr's value */
+#define ATTR_OP_SET 2 /* set/create the indicated attr/value pair */
+#define ATTR_OP_REMOVE 3 /* remove the indicated attr */
__s32 am_error;
void __user *am_attrname;
void __user *am_attrvalue;
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 381ebda4f7b..84583cf73db 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -95,6 +95,8 @@ xfs_fs_geometry(
XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) |
(xfs_sb_version_hassector(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_SECTOR : 0) |
+ (xfs_sb_version_hasasciici(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_DIRV2CI : 0) |
(xfs_sb_version_haslazysbcount(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
(xfs_sb_version_hasattr2(&mp->m_sb) ?
@@ -625,7 +627,7 @@ xfs_fs_goingdown(
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
thaw_bdev(sb->s_bdev, sb);
}
-
+
break;
}
case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index e5310c90e50..83502f3edef 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -181,7 +181,7 @@ xfs_inobt_delrec(
* then we can get rid of this level.
*/
if (numrecs == 1 && level > 0) {
- agbp = cur->bc_private.i.agbp;
+ agbp = cur->bc_private.a.agbp;
agi = XFS_BUF_TO_AGI(agbp);
/*
* pp is still set to the first pointer in the block.
@@ -194,7 +194,7 @@ xfs_inobt_delrec(
* Free the block.
*/
if ((error = xfs_free_extent(cur->bc_tp,
- XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1)))
+ XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1)))
return error;
xfs_trans_binval(cur->bc_tp, bp);
xfs_ialloc_log_agi(cur->bc_tp, agbp,
@@ -379,7 +379,7 @@ xfs_inobt_delrec(
rrecs = be16_to_cpu(right->bb_numrecs);
rbp = bp;
if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.i.agno, lbno, 0, &lbp,
+ cur->bc_private.a.agno, lbno, 0, &lbp,
XFS_INO_BTREE_REF)))
return error;
left = XFS_BUF_TO_INOBT_BLOCK(lbp);
@@ -401,7 +401,7 @@ xfs_inobt_delrec(
lrecs = be16_to_cpu(left->bb_numrecs);
lbp = bp;
if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.i.agno, rbno, 0, &rbp,
+ cur->bc_private.a.agno, rbno, 0, &rbp,
XFS_INO_BTREE_REF)))
return error;
right = XFS_BUF_TO_INOBT_BLOCK(rbp);
@@ -484,7 +484,7 @@ xfs_inobt_delrec(
xfs_buf_t *rrbp;
if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib), 0,
+ cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
&rrbp, XFS_INO_BTREE_REF)))
return error;
rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
@@ -497,7 +497,7 @@ xfs_inobt_delrec(
* Free the deleting block.
*/
if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
- cur->bc_private.i.agno, rbno), 1)))
+ cur->bc_private.a.agno, rbno), 1)))
return error;
xfs_trans_binval(cur->bc_tp, rbp);
/*
@@ -854,7 +854,7 @@ xfs_inobt_lookup(
{
xfs_agi_t *agi; /* a.g. inode header */
- agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
+ agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
agno = be32_to_cpu(agi->agi_seqno);
agbno = be32_to_cpu(agi->agi_root);
}
@@ -1089,7 +1089,7 @@ xfs_inobt_lshift(
* Set up the left neighbor as "left".
*/
if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.i.agno, be32_to_cpu(right->bb_leftsib),
+ cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
0, &lbp, XFS_INO_BTREE_REF)))
return error;
left = XFS_BUF_TO_INOBT_BLOCK(lbp);
@@ -1207,10 +1207,10 @@ xfs_inobt_newroot(
/*
* Get a block & a buffer.
*/
- agi = XFS_BUF_TO_AGI(cur->bc_private.i.agbp);
+ agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
- args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno,
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno,
be32_to_cpu(agi->agi_root));
args.mod = args.minleft = args.alignment = args.total = args.wasdel =
args.isfl = args.userdata = args.minalignslop = 0;
@@ -1233,7 +1233,7 @@ xfs_inobt_newroot(
*/
agi->agi_root = cpu_to_be32(args.agbno);
be32_add_cpu(&agi->agi_level, 1);
- xfs_ialloc_log_agi(args.tp, cur->bc_private.i.agbp,
+ xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp,
XFS_AGI_ROOT | XFS_AGI_LEVEL);
/*
* At the previous root level there are now two blocks: the old
@@ -1376,7 +1376,7 @@ xfs_inobt_rshift(
* Set up the right neighbor as "right".
*/
if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.i.agno, be32_to_cpu(left->bb_rightsib),
+ cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
0, &rbp, XFS_INO_BTREE_REF)))
return error;
right = XFS_BUF_TO_INOBT_BLOCK(rbp);
@@ -1492,7 +1492,7 @@ xfs_inobt_split(
* Allocate the new block.
* If we can't do it, we're toast. Give up.
*/
- args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.i.agno, lbno);
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno);
args.mod = args.minleft = args.alignment = args.total = args.wasdel =
args.isfl = args.userdata = args.minalignslop = 0;
args.minlen = args.maxlen = args.prod = 1;
@@ -1725,7 +1725,7 @@ xfs_inobt_decrement(
agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.i.agno, agbno, 0, &bp,
+ cur->bc_private.a.agno, agbno, 0, &bp,
XFS_INO_BTREE_REF)))
return error;
lev--;
@@ -1897,7 +1897,7 @@ xfs_inobt_increment(
agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.i.agno, agbno, 0, &bp,
+ cur->bc_private.a.agno, agbno, 0, &bp,
XFS_INO_BTREE_REF)))
return error;
lev--;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index b07604b94d9..e229e9e001c 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -216,7 +216,14 @@ finish_inode:
mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
init_waitqueue_head(&ip->i_ipin_wait);
atomic_set(&ip->i_pincount, 0);
- initnsema(&ip->i_flock, 1, "xfsfino");
+
+ /*
+ * Because we want to use a counting completion, complete
+ * the flush completion once to allow a single access to
+ * the flush completion without blocking.
+ */
+ init_completion(&ip->i_flush);
+ complete(&ip->i_flush);
if (lock_flags)
xfs_ilock(ip, lock_flags);
@@ -288,10 +295,17 @@ finish_inode:
*ipp = ip;
/*
+ * Set up the Linux with the Linux inode.
+ */
+ ip->i_vnode = inode;
+ inode->i_private = ip;
+
+ /*
* If we have a real type for an on-disk inode, we can set ops(&unlock)
* now. If it's a new inode being created, xfs_ialloc will handle it.
*/
- xfs_initialize_vnode(mp, inode, ip);
+ if (ip->i_d.di_mode != 0)
+ xfs_setup_inode(ip);
return 0;
}
@@ -411,10 +425,11 @@ xfs_iput(xfs_inode_t *ip,
* Special iput for brand-new inodes that are still locked
*/
void
-xfs_iput_new(xfs_inode_t *ip,
- uint lock_flags)
+xfs_iput_new(
+ xfs_inode_t *ip,
+ uint lock_flags)
{
- struct inode *inode = ip->i_vnode;
+ struct inode *inode = VFS_I(ip);
xfs_itrace_entry(ip);
@@ -775,26 +790,3 @@ xfs_isilocked(
}
#endif
-/*
- * The following three routines simply manage the i_flock
- * semaphore embedded in the inode. This semaphore synchronizes
- * processes attempting to flush the in-core inode back to disk.
- */
-void
-xfs_iflock(xfs_inode_t *ip)
-{
- psema(&(ip->i_flock), PINOD|PLTWAIT);
-}
-
-int
-xfs_iflock_nowait(xfs_inode_t *ip)
-{
- return (cpsema(&(ip->i_flock)));
-}
-
-void
-xfs_ifunlock(xfs_inode_t *ip)
-{
- ASSERT(issemalocked(&(ip->i_flock)));
- vsema(&(ip->i_flock));
-}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e569bf5d6cf..dbd9cef852e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -580,8 +580,8 @@ xfs_iformat_extents(
xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
for (i = 0; i < nex; i++, dp++) {
xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
- ep->l0 = be64_to_cpu(get_unaligned(&dp->l0));
- ep->l1 = be64_to_cpu(get_unaligned(&dp->l1));
+ ep->l0 = get_unaligned_be64(&dp->l0);
+ ep->l1 = get_unaligned_be64(&dp->l1);
}
XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
if (whichfork != XFS_DATA_FORK ||
@@ -835,22 +835,22 @@ xfs_iread(
* Do this before xfs_iformat in case it adds entries.
*/
#ifdef XFS_INODE_TRACE
- ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_SLEEP);
+ ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
#endif
#ifdef XFS_BMAP_TRACE
- ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP);
+ ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
#endif
#ifdef XFS_BMBT_TRACE
- ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP);
+ ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
#endif
#ifdef XFS_RW_TRACE
- ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP);
+ ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
#endif
#ifdef XFS_ILOCK_TRACE
- ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP);
+ ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
#endif
#ifdef XFS_DIR2_TRACE
- ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP);
+ ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
#endif
/*
@@ -1046,9 +1046,9 @@ xfs_ialloc(
{
xfs_ino_t ino;
xfs_inode_t *ip;
- bhv_vnode_t *vp;
uint flags;
int error;
+ timespec_t tv;
/*
* Call the space management code to pick
@@ -1077,13 +1077,12 @@ xfs_ialloc(
}
ASSERT(ip != NULL);
- vp = XFS_ITOV(ip);
ip->i_d.di_mode = (__uint16_t)mode;
ip->i_d.di_onlink = 0;
ip->i_d.di_nlink = nlink;
ASSERT(ip->i_d.di_nlink == nlink);
- ip->i_d.di_uid = current_fsuid(cr);
- ip->i_d.di_gid = current_fsgid(cr);
+ ip->i_d.di_uid = current_fsuid();
+ ip->i_d.di_gid = current_fsgid();
ip->i_d.di_projid = prid;
memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
@@ -1130,7 +1129,13 @@ xfs_ialloc(
ip->i_size = 0;
ip->i_d.di_nextents = 0;
ASSERT(ip->i_d.di_nblocks == 0);
- xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD);
+
+ nanotime(&tv);
+ ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
+ ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
+ ip->i_d.di_atime = ip->i_d.di_mtime;
+ ip->i_d.di_ctime = ip->i_d.di_mtime;
+
/*
* di_gen will have been taken care of in xfs_iread.
*/
@@ -1220,7 +1225,7 @@ xfs_ialloc(
xfs_trans_log_inode(tp, ip, flags);
/* now that we have an i_mode we can setup inode ops and unlock */
- xfs_initialize_vnode(tp->t_mountp, vp, ip);
+ xfs_setup_inode(ip);
*ipp = ip;
return 0;
@@ -1399,7 +1404,6 @@ xfs_itruncate_start(
xfs_fsize_t last_byte;
xfs_off_t toss_start;
xfs_mount_t *mp;
- bhv_vnode_t *vp;
int error = 0;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
@@ -1408,7 +1412,6 @@ xfs_itruncate_start(
(flags == XFS_ITRUNC_MAYBE));
mp = ip->i_mount;
- vp = XFS_ITOV(ip);
/* wait for the completion of any pending DIOs */
if (new_size < ip->i_size)
@@ -1457,7 +1460,7 @@ xfs_itruncate_start(
#ifdef DEBUG
if (new_size == 0) {
- ASSERT(VN_CACHED(vp) == 0);
+ ASSERT(VN_CACHED(VFS_I(ip)) == 0);
}
#endif
return error;
@@ -1763,67 +1766,6 @@ xfs_itruncate_finish(
return 0;
}
-
-/*
- * xfs_igrow_start
- *
- * Do the first part of growing a file: zero any data in the last
- * block that is beyond the old EOF. We need to do this before
- * the inode is joined to the transaction to modify the i_size.
- * That way we can drop the inode lock and call into the buffer
- * cache to get the buffer mapping the EOF.
- */
-int
-xfs_igrow_start(
- xfs_inode_t *ip,
- xfs_fsize_t new_size,
- cred_t *credp)
-{
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
- ASSERT(new_size > ip->i_size);
-
- /*
- * Zero any pages that may have been created by
- * xfs_write_file() beyond the end of the file
- * and any blocks between the old and new file sizes.
- */
- return xfs_zero_eof(ip, new_size, ip->i_size);
-}
-
-/*
- * xfs_igrow_finish
- *
- * This routine is called to extend the size of a file.
- * The inode must have both the iolock and the ilock locked
- * for update and it must be a part of the current transaction.
- * The xfs_igrow_start() function must have been called previously.
- * If the change_flag is not zero, the inode change timestamp will
- * be updated.
- */
-void
-xfs_igrow_finish(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- xfs_fsize_t new_size,
- int change_flag)
-{
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
- ASSERT(ip->i_transp == tp);
- ASSERT(new_size > ip->i_size);
-
- /*
- * Update the file size. Update the inode change timestamp
- * if change_flag set.
- */
- ip->i_d.di_size = new_size;
- ip->i_size = new_size;
- if (change_flag)
- xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-}
-
-
/*
* This is called when the inode's link count goes to 0.
* We place the on-disk inode on a list in the AGI. It
@@ -2258,7 +2200,7 @@ xfs_ifree_cluster(
xfs_trans_binval(tp, bp);
}
- kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *));
+ kmem_free(ip_found);
xfs_put_perag(mp, pag);
}
@@ -2470,7 +2412,7 @@ xfs_iroot_realloc(
(int)new_size);
memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
}
- kmem_free(ifp->if_broot, ifp->if_broot_bytes);
+ kmem_free(ifp->if_broot);
ifp->if_broot = new_broot;
ifp->if_broot_bytes = (int)new_size;
ASSERT(ifp->if_broot_bytes <=
@@ -2514,7 +2456,7 @@ xfs_idata_realloc(
if (new_size == 0) {
if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
- kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
+ kmem_free(ifp->if_u1.if_data);
}
ifp->if_u1.if_data = NULL;
real_size = 0;
@@ -2529,7 +2471,7 @@ xfs_idata_realloc(
ASSERT(ifp->if_real_bytes != 0);
memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
new_size);
- kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
+ kmem_free(ifp->if_u1.if_data);
ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
}
real_size = 0;
@@ -2636,7 +2578,7 @@ xfs_idestroy_fork(
ifp = XFS_IFORK_PTR(ip, whichfork);
if (ifp->if_broot != NULL) {
- kmem_free(ifp->if_broot, ifp->if_broot_bytes);
+ kmem_free(ifp->if_broot);
ifp->if_broot = NULL;
}
@@ -2650,7 +2592,7 @@ xfs_idestroy_fork(
if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
(ifp->if_u1.if_data != NULL)) {
ASSERT(ifp->if_real_bytes != 0);
- kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
+ kmem_free(ifp->if_u1.if_data);
ifp->if_u1.if_data = NULL;
ifp->if_real_bytes = 0;
}
@@ -2691,7 +2633,6 @@ xfs_idestroy(
xfs_idestroy_fork(ip, XFS_ATTR_FORK);
mrfree(&ip->i_lock);
mrfree(&ip->i_iolock);
- freesema(&ip->i_flock);
#ifdef XFS_INODE_TRACE
ktrace_free(ip->i_trace);
@@ -3058,7 +2999,7 @@ xfs_iflush_cluster(
out_free:
read_unlock(&pag->pag_ici_lock);
- kmem_free(ilist, ilist_size);
+ kmem_free(ilist);
return 0;
@@ -3102,17 +3043,17 @@ cluster_corrupt_out:
* Unlocks the flush lock
*/
xfs_iflush_abort(iq);
- kmem_free(ilist, ilist_size);
+ kmem_free(ilist);
return XFS_ERROR(EFSCORRUPTED);
}
/*
* xfs_iflush() will write a modified inode's changes out to the
* inode's on disk home. The caller must have the inode lock held
- * in at least shared mode and the inode flush semaphore must be
- * held as well. The inode lock will still be held upon return from
+ * in at least shared mode and the inode flush completion must be
+ * active as well. The inode lock will still be held upon return from
* the call and the caller is free to unlock it.
- * The inode flush lock will be unlocked when the inode reaches the disk.
+ * The inode flush will be completed when the inode reaches the disk.
* The flags indicate how the inode's buffer should be written out.
*/
int
@@ -3131,7 +3072,7 @@ xfs_iflush(
XFS_STATS_INC(xs_iflush_count);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(issemalocked(&(ip->i_flock)));
+ ASSERT(!completion_done(&ip->i_flush));
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
ip->i_d.di_nextents > ip->i_df.if_ext_max);
@@ -3143,8 +3084,6 @@ xfs_iflush(
* flush lock and do nothing.
*/
if (xfs_inode_clean(ip)) {
- ASSERT((iip != NULL) ?
- !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
xfs_ifunlock(ip);
return 0;
}
@@ -3296,7 +3235,7 @@ xfs_iflush_int(
#endif
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
- ASSERT(issemalocked(&(ip->i_flock)));
+ ASSERT(!completion_done(&ip->i_flush));
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
ip->i_d.di_nextents > ip->i_df.if_ext_max);
@@ -3528,7 +3467,6 @@ xfs_iflush_all(
xfs_mount_t *mp)
{
xfs_inode_t *ip;
- bhv_vnode_t *vp;
again:
XFS_MOUNT_ILOCK(mp);
@@ -3543,14 +3481,13 @@ xfs_iflush_all(
continue;
}
- vp = XFS_ITOV_NULL(ip);
- if (!vp) {
+ if (!VFS_I(ip)) {
XFS_MOUNT_IUNLOCK(mp);
xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
goto again;
}
- ASSERT(vn_count(vp) == 0);
+ ASSERT(vn_count(VFS_I(ip)) == 0);
ip = ip->i_mnext;
} while (ip != mp->m_inodes);
@@ -3770,7 +3707,7 @@ xfs_iext_add_indirect_multi(
* (all extents past */
if (nex2) {
byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
- nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_SLEEP);
+ nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
erp->er_extcount -= nex2;
xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
@@ -3836,7 +3773,7 @@ xfs_iext_add_indirect_multi(
erp = xfs_iext_irec_new(ifp, erp_idx);
}
memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
- kmem_free(nex2_ep, byte_diff);
+ kmem_free(nex2_ep);
erp->er_extcount += nex2;
xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
}
@@ -4070,8 +4007,7 @@ xfs_iext_realloc_direct(
ifp->if_u1.if_extents =
kmem_realloc(ifp->if_u1.if_extents,
rnew_size,
- ifp->if_real_bytes,
- KM_SLEEP);
+ ifp->if_real_bytes, KM_NOFS);
}
if (rnew_size > ifp->if_real_bytes) {
memset(&ifp->if_u1.if_extents[ifp->if_bytes /
@@ -4112,7 +4048,7 @@ xfs_iext_direct_to_inline(
*/
memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
nextents * sizeof(xfs_bmbt_rec_t));
- kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
+ kmem_free(ifp->if_u1.if_extents);
ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
ifp->if_real_bytes = 0;
}
@@ -4130,7 +4066,7 @@ xfs_iext_inline_to_direct(
xfs_ifork_t *ifp, /* inode fork pointer */
int new_size) /* number of extents in file */
{
- ifp->if_u1.if_extents = kmem_alloc(new_size, KM_SLEEP);
+ ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
memset(ifp->if_u1.if_extents, 0, new_size);
if (ifp->if_bytes) {
memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
@@ -4162,7 +4098,7 @@ xfs_iext_realloc_indirect(
} else {
ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
kmem_realloc(ifp->if_u1.if_ext_irec,
- new_size, size, KM_SLEEP);
+ new_size, size, KM_NOFS);
}
}
@@ -4182,11 +4118,11 @@ xfs_iext_indirect_to_direct(
ASSERT(nextents <= XFS_LINEAR_EXTS);
size = nextents * sizeof(xfs_bmbt_rec_t);
- xfs_iext_irec_compact_full(ifp);
+ xfs_iext_irec_compact_pages(ifp);
ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
ep = ifp->if_u1.if_ext_irec->er_extbuf;
- kmem_free(ifp->if_u1.if_ext_irec, sizeof(xfs_ext_irec_t));
+ kmem_free(ifp->if_u1.if_ext_irec);
ifp->if_flags &= ~XFS_IFEXTIREC;
ifp->if_u1.if_extents = ep;
ifp->if_bytes = size;
@@ -4212,7 +4148,7 @@ xfs_iext_destroy(
}
ifp->if_flags &= ~XFS_IFEXTIREC;
} else if (ifp->if_real_bytes) {
- kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
+ kmem_free(ifp->if_u1.if_extents);
} else if (ifp->if_bytes) {
memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
sizeof(xfs_bmbt_rec_t));
@@ -4404,11 +4340,10 @@ xfs_iext_irec_init(
nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
ASSERT(nextents <= XFS_LINEAR_EXTS);
- erp = (xfs_ext_irec_t *)
- kmem_alloc(sizeof(xfs_ext_irec_t), KM_SLEEP);
+ erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
if (nextents == 0) {
- ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP);
+ ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
} else if (!ifp->if_real_bytes) {
xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
} else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
@@ -4456,7 +4391,7 @@ xfs_iext_irec_new(
/* Initialize new extent record */
erp = ifp->if_u1.if_ext_irec;
- erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_SLEEP);
+ erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
erp[erp_idx].er_extcount = 0;
@@ -4483,7 +4418,7 @@ xfs_iext_irec_remove(
if (erp->er_extbuf) {
xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
-erp->er_extcount);
- kmem_free(erp->er_extbuf, XFS_IEXT_BUFSZ);
+ kmem_free(erp->er_extbuf);
}
/* Compact extent records */
erp = ifp->if_u1.if_ext_irec;
@@ -4501,8 +4436,7 @@ xfs_iext_irec_remove(
xfs_iext_realloc_indirect(ifp,
nlists * sizeof(xfs_ext_irec_t));
} else {
- kmem_free(ifp->if_u1.if_ext_irec,
- sizeof(xfs_ext_irec_t));
+ kmem_free(ifp->if_u1.if_ext_irec);
}
ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
}
@@ -4515,8 +4449,7 @@ xfs_iext_irec_remove(
* compaction policy is as follows:
*
* Full Compaction: Extents fit into a single page (or inline buffer)
- * Full Compaction: Extents occupy less than 10% of allocated space
- * Partial Compaction: Extents occupy > 10% and < 50% of allocated space
+ * Partial Compaction: Extents occupy less than 50% of allocated space
* No Compaction: Extents occupy at least 50% of allocated space
*/
void
@@ -4537,8 +4470,6 @@ xfs_iext_irec_compact(
xfs_iext_direct_to_inline(ifp, nextents);
} else if (nextents <= XFS_LINEAR_EXTS) {
xfs_iext_indirect_to_direct(ifp);
- } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) {
- xfs_iext_irec_compact_full(ifp);
} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
xfs_iext_irec_compact_pages(ifp);
}
@@ -4562,7 +4493,7 @@ xfs_iext_irec_compact_pages(
erp_next = erp + 1;
if (erp_next->er_extcount <=
(XFS_LINEAR_EXTS - erp->er_extcount)) {
- memmove(&erp->er_extbuf[erp->er_extcount],
+ memcpy(&erp->er_extbuf[erp->er_extcount],
erp_next->er_extbuf, erp_next->er_extcount *
sizeof(xfs_bmbt_rec_t));
erp->er_extcount += erp_next->er_extcount;
@@ -4571,75 +4502,13 @@ xfs_iext_irec_compact_pages(
* so er_extoffs don't get modified in
* xfs_iext_irec_remove.
*/
- kmem_free(erp_next->er_extbuf, XFS_IEXT_BUFSZ);
- erp_next->er_extbuf = NULL;
- xfs_iext_irec_remove(ifp, erp_idx + 1);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- } else {
- erp_idx++;
- }
- }
-}
-
-/*
- * Fully compact the extent records managed by the indirection array.
- */
-void
-xfs_iext_irec_compact_full(
- xfs_ifork_t *ifp) /* inode fork pointer */
-{
- xfs_bmbt_rec_host_t *ep, *ep_next; /* extent record pointers */
- xfs_ext_irec_t *erp, *erp_next; /* extent irec pointers */
- int erp_idx = 0; /* extent irec index */
- int ext_avail; /* empty entries in ex list */
- int ext_diff; /* number of exts to add */
- int nlists; /* number of irec's (ex lists) */
-
- ASSERT(ifp->if_flags & XFS_IFEXTIREC);
- nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- erp = ifp->if_u1.if_ext_irec;
- ep = &erp->er_extbuf[erp->er_extcount];
- erp_next = erp + 1;
- ep_next = erp_next->er_extbuf;
- while (erp_idx < nlists - 1) {
- ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
- ext_diff = MIN(ext_avail, erp_next->er_extcount);
- memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t));
- erp->er_extcount += ext_diff;
- erp_next->er_extcount -= ext_diff;
- /* Remove next page */
- if (erp_next->er_extcount == 0) {
- /*
- * Free page before removing extent record
- * so er_extoffs don't get modified in
- * xfs_iext_irec_remove.
- */
- kmem_free(erp_next->er_extbuf,
- erp_next->er_extcount * sizeof(xfs_bmbt_rec_t));
+ kmem_free(erp_next->er_extbuf);
erp_next->er_extbuf = NULL;
xfs_iext_irec_remove(ifp, erp_idx + 1);
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
- /* Update next page */
} else {
- /* Move rest of page up to become next new page */
- memmove(erp_next->er_extbuf, ep_next,
- erp_next->er_extcount * sizeof(xfs_bmbt_rec_t));
- ep_next = erp_next->er_extbuf;
- memset(&ep_next[erp_next->er_extcount], 0,
- (XFS_LINEAR_EXTS - erp_next->er_extcount) *
- sizeof(xfs_bmbt_rec_t));
- }
- if (erp->er_extcount == XFS_LINEAR_EXTS) {
erp_idx++;
- if (erp_idx < nlists)
- erp = &ifp->if_u1.if_ext_irec[erp_idx];
- else
- break;
}
- ep = &erp->er_extbuf[erp->er_extcount];
- erp_next = erp + 1;
- ep_next = erp_next->er_extbuf;
}
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 0a999fee4f0..1420c49674d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -87,8 +87,7 @@ typedef struct xfs_ifork {
* Flags for xfs_ichgtime().
*/
#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
-#define XFS_ICHGTIME_ACC 0x2 /* data fork access timestamp */
-#define XFS_ICHGTIME_CHG 0x4 /* inode field change timestamp */
+#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
/*
* Per-fork incore inode flags.
@@ -204,7 +203,7 @@ typedef struct xfs_inode {
struct xfs_inode *i_mprev; /* ptr to prev inode */
struct xfs_mount *i_mount; /* fs mount struct ptr */
struct list_head i_reclaim; /* reclaim list */
- bhv_vnode_t *i_vnode; /* vnode backpointer */
+ struct inode *i_vnode; /* vnode backpointer */
struct xfs_dquot *i_udquot; /* user dquot */
struct xfs_dquot *i_gdquot; /* group dquot */
@@ -223,7 +222,7 @@ typedef struct xfs_inode {
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
mrlock_t i_iolock; /* inode IO lock */
- sema_t i_flock; /* inode flush lock */
+ struct completion i_flush; /* inode flush completion q */
atomic_t i_pincount; /* inode pin count */
wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
spinlock_t i_flags_lock; /* inode i_flags lock */
@@ -263,6 +262,18 @@ typedef struct xfs_inode {
#define XFS_ISIZE(ip) (((ip)->i_d.di_mode & S_IFMT) == S_IFREG) ? \
(ip)->i_size : (ip)->i_d.di_size;
+/* Convert from vfs inode to xfs inode */
+static inline struct xfs_inode *XFS_I(struct inode *inode)
+{
+ return (struct xfs_inode *)inode->i_private;
+}
+
+/* convert from xfs inode to vfs inode */
+static inline struct inode *VFS_I(struct xfs_inode *ip)
+{
+ return (struct inode *)ip->i_vnode;
+}
+
/*
* i_flags helper functions
*/
@@ -439,9 +450,6 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
#define XFS_ITRUNC_DEFINITE 0x1
#define XFS_ITRUNC_MAYBE 0x2
-#define XFS_ITOV(ip) ((ip)->i_vnode)
-#define XFS_ITOV_NULL(ip) ((ip)->i_vnode)
-
/*
* For multiple groups support: if S_ISGID bit is set in the parent
* directory, group of new file is set to that of the parent, and
@@ -473,11 +481,8 @@ int xfs_ilock_nowait(xfs_inode_t *, uint);
void xfs_iunlock(xfs_inode_t *, uint);
void xfs_ilock_demote(xfs_inode_t *, uint);
int xfs_isilocked(xfs_inode_t *, uint);
-void xfs_iflock(xfs_inode_t *);
-int xfs_iflock_nowait(xfs_inode_t *);
uint xfs_ilock_map_shared(xfs_inode_t *);
void xfs_iunlock_map_shared(xfs_inode_t *, uint);
-void xfs_ifunlock(xfs_inode_t *);
void xfs_ireclaim(xfs_inode_t *);
int xfs_finish_reclaim(xfs_inode_t *, int, int);
int xfs_finish_reclaim_all(struct xfs_mount *, int);
@@ -507,9 +512,6 @@ int xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t);
int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
xfs_fsize_t, int, int);
int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
-int xfs_igrow_start(xfs_inode_t *, xfs_fsize_t, struct cred *);
-void xfs_igrow_finish(struct xfs_trans *, xfs_inode_t *,
- xfs_fsize_t, int);
void xfs_idestroy_fork(xfs_inode_t *, int);
void xfs_idestroy(xfs_inode_t *);
@@ -525,6 +527,7 @@ void xfs_iflush_all(struct xfs_mount *);
void xfs_ichgtime(xfs_inode_t *, int);
xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
void xfs_lock_inodes(xfs_inode_t **, int, uint);
+void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
void xfs_synchronize_atime(xfs_inode_t *);
void xfs_mark_inode_dirty_sync(xfs_inode_t *);
@@ -573,6 +576,26 @@ extern struct kmem_zone *xfs_ifork_zone;
extern struct kmem_zone *xfs_inode_zone;
extern struct kmem_zone *xfs_ili_zone;
+/*
+ * Manage the i_flush queue embedded in the inode. This completion
+ * queue synchronizes processes attempting to flush the in-core
+ * inode back to disk.
+ */
+static inline void xfs_iflock(xfs_inode_t *ip)
+{
+ wait_for_completion(&ip->i_flush);
+}
+
+static inline int xfs_iflock_nowait(xfs_inode_t *ip)
+{
+ return try_wait_for_completion(&ip->i_flush);
+}
+
+static inline void xfs_ifunlock(xfs_inode_t *ip)
+{
+ complete(&ip->i_flush);
+}
+
#endif /* __KERNEL__ */
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 167b33f1577..97c7452e262 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -686,7 +686,7 @@ xfs_inode_item_unlock(
ASSERT(ip->i_d.di_nextents > 0);
ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_DEXT);
ASSERT(ip->i_df.if_bytes > 0);
- kmem_free(iip->ili_extents_buf, ip->i_df.if_bytes);
+ kmem_free(iip->ili_extents_buf);
iip->ili_extents_buf = NULL;
}
if (iip->ili_aextents_buf != NULL) {
@@ -694,7 +694,7 @@ xfs_inode_item_unlock(
ASSERT(ip->i_d.di_anextents > 0);
ASSERT(iip->ili_format.ilf_fields & XFS_ILOG_AEXT);
ASSERT(ip->i_afp->if_bytes > 0);
- kmem_free(iip->ili_aextents_buf, ip->i_afp->if_bytes);
+ kmem_free(iip->ili_aextents_buf);
iip->ili_aextents_buf = NULL;
}
@@ -779,11 +779,10 @@ xfs_inode_item_pushbuf(
ASSERT(iip->ili_push_owner == current_pid());
/*
- * If flushlock isn't locked anymore, chances are that the
- * inode flush completed and the inode was taken off the AIL.
- * So, just get out.
+ * If a flush is not in progress anymore, chances are that the
+ * inode was taken off the AIL. So, just get out.
*/
- if (!issemalocked(&(ip->i_flock)) ||
+ if (completion_done(&ip->i_flush) ||
((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
iip->ili_pushbuf_flag = 0;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
@@ -805,7 +804,7 @@ xfs_inode_item_pushbuf(
* If not, we can flush it async.
*/
dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
- issemalocked(&(ip->i_flock)));
+ !completion_done(&ip->i_flush));
iip->ili_pushbuf_flag = 0;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
xfs_buftrace("INODE ITEM PUSH", bp);
@@ -858,7 +857,7 @@ xfs_inode_item_push(
ip = iip->ili_inode;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
- ASSERT(issemalocked(&(ip->i_flock)));
+ ASSERT(!completion_done(&ip->i_flush));
/*
* Since we were able to lock the inode's flush lock and
* we found it on the AIL, the inode must be dirty. This
@@ -957,8 +956,7 @@ xfs_inode_item_destroy(
{
#ifdef XFS_TRANS_DEBUG
if (ip->i_itemp->ili_root_size != 0) {
- kmem_free(ip->i_itemp->ili_orig_root,
- ip->i_itemp->ili_root_size);
+ kmem_free(ip->i_itemp->ili_orig_root);
}
#endif
kmem_zone_free(xfs_ili_zone, ip->i_itemp);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 7edcde691d1..67f22b2b44b 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -889,6 +889,16 @@ xfs_iomap_write_unwritten(
count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
+ /*
+ * Reserve enough blocks in this transaction for two complete extent
+ * btree splits. We may be converting the middle part of an unwritten
+ * extent and in this case we will insert two new extents in the btree
+ * each of which could cause a full split.
+ *
+ * This reservation amount will be used in the first call to
+ * xfs_bmbt_split() to select an AG with enough space to satisfy the
+ * rest of the operation.
+ */
resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
do {
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 419de15aeb4..cf6754a3c5b 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -59,7 +59,6 @@ xfs_bulkstat_one_iget(
{
xfs_icdinode_t *dic; /* dinode core info pointer */
xfs_inode_t *ip; /* incore inode pointer */
- bhv_vnode_t *vp;
int error;
error = xfs_iget(mp, NULL, ino,
@@ -72,7 +71,6 @@ xfs_bulkstat_one_iget(
ASSERT(ip != NULL);
ASSERT(ip->i_blkno != (xfs_daddr_t)0);
- vp = XFS_ITOV(ip);
dic = &ip->i_d;
/* xfs_iget returns the following without needing
@@ -85,7 +83,7 @@ xfs_bulkstat_one_iget(
buf->bs_uid = dic->di_uid;
buf->bs_gid = dic->di_gid;
buf->bs_size = dic->di_size;
- vn_atime_to_bstime(vp, &buf->bs_atime);
+ vn_atime_to_bstime(VFS_I(ip), &buf->bs_atime);
buf->bs_mtime.tv_sec = dic->di_mtime.t_sec;
buf->bs_mtime.tv_nsec = dic->di_mtime.t_nsec;
buf->bs_ctime.tv_sec = dic->di_ctime.t_sec;
@@ -257,7 +255,7 @@ xfs_bulkstat_one(
*ubused = error;
out_free:
- kmem_free(buf, sizeof(*buf));
+ kmem_free(buf);
return error;
}
@@ -708,7 +706,7 @@ xfs_bulkstat(
/*
* Done, we're either out of filesystem or space to put the data.
*/
- kmem_free(irbuf, irbsize);
+ kmem_free(irbuf);
*ubcountp = ubelem;
/*
* Found some inodes, return them now and return the error next time.
@@ -914,7 +912,7 @@ xfs_inumbers(
}
*lastino = XFS_AGINO_TO_INO(mp, agno, agino);
}
- kmem_free(buffer, bcount * sizeof(*buffer));
+ kmem_free(buffer);
if (cur)
xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR :
XFS_BTREE_NOERROR));
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index afaee301b0e..0b02c644355 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -124,16 +124,27 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog,
STATIC int xlog_iclogs_empty(xlog_t *log);
#if defined(XFS_LOG_TRACE)
+
+#define XLOG_TRACE_LOGGRANT_SIZE 2048
+#define XLOG_TRACE_ICLOG_SIZE 256
+
+void
+xlog_trace_loggrant_alloc(xlog_t *log)
+{
+ log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS);
+}
+
+void
+xlog_trace_loggrant_dealloc(xlog_t *log)
+{
+ ktrace_free(log->l_grant_trace);
+}
+
void
xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
{
unsigned long cnts;
- if (!log->l_grant_trace) {
- log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP);
- if (!log->l_grant_trace)
- return;
- }
/* ticket counts are 1 byte each */
cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;
@@ -157,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
}
void
+xlog_trace_iclog_alloc(xlog_in_core_t *iclog)
+{
+ iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS);
+}
+
+void
+xlog_trace_iclog_dealloc(xlog_in_core_t *iclog)
+{
+ ktrace_free(iclog->ic_trace);
+}
+
+void
xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
{
- if (!iclog->ic_trace)
- iclog->ic_trace = ktrace_alloc(256, KM_SLEEP);
ktrace_enter(iclog->ic_trace,
(void *)((unsigned long)state),
(void *)((unsigned long)current_pid()),
@@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state)
(void *)NULL, (void *)NULL);
}
#else
+
+#define xlog_trace_loggrant_alloc(log)
+#define xlog_trace_loggrant_dealloc(log)
#define xlog_trace_loggrant(log,tic,string)
+
+#define xlog_trace_iclog_alloc(iclog)
+#define xlog_trace_iclog_dealloc(iclog)
#define xlog_trace_iclog(iclog,state)
+
#endif /* XFS_LOG_TRACE */
@@ -226,20 +254,24 @@ xlog_grant_sub_space(struct log *log, int bytes)
static void
xlog_grant_add_space_write(struct log *log, int bytes)
{
- log->l_grant_write_bytes += bytes;
- if (log->l_grant_write_bytes > log->l_logsize) {
- log->l_grant_write_bytes -= log->l_logsize;
+ int tmp = log->l_logsize - log->l_grant_write_bytes;
+ if (tmp > bytes)
+ log->l_grant_write_bytes += bytes;
+ else {
log->l_grant_write_cycle++;
+ log->l_grant_write_bytes = bytes - tmp;
}
}
static void
xlog_grant_add_space_reserve(struct log *log, int bytes)
{
- log->l_grant_reserve_bytes += bytes;
- if (log->l_grant_reserve_bytes > log->l_logsize) {
- log->l_grant_reserve_bytes -= log->l_logsize;
+ int tmp = log->l_logsize - log->l_grant_reserve_bytes;
+ if (tmp > bytes)
+ log->l_grant_reserve_bytes += bytes;
+ else {
log->l_grant_reserve_cycle++;
+ log->l_grant_reserve_bytes = bytes - tmp;
}
}
@@ -332,15 +364,12 @@ xfs_log_done(xfs_mount_t *mp,
} else {
xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
xlog_regrant_reserve_log_space(log, ticket);
- }
-
- /* If this ticket was a permanent reservation and we aren't
- * trying to release it, reset the inited flags; so next time
- * we write, a start record will be written out.
- */
- if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) &&
- (flags & XFS_LOG_REL_PERM_RESERV) == 0)
+ /* If this ticket was a permanent reservation and we aren't
+ * trying to release it, reset the inited flags; so next time
+ * we write, a start record will be written out.
+ */
ticket->t_flags |= XLOG_TIC_INITED;
+ }
return lsn;
} /* xfs_log_done */
@@ -353,11 +382,11 @@ xfs_log_done(xfs_mount_t *mp,
* Asynchronous forces are implemented by setting the WANT_SYNC
* bit in the appropriate in-core log and then returning.
*
- * Synchronous forces are implemented with a semaphore. All callers
- * to force a given lsn to disk will wait on a semaphore attached to the
+ * Synchronous forces are implemented with a signal variable. All callers
+ * to force a given lsn to disk will wait on a the sv attached to the
* specific in-core log. When given in-core log finally completes its
* write to disk, that thread will wake up all threads waiting on the
- * semaphore.
+ * sv.
*/
int
_xfs_log_force(
@@ -584,12 +613,12 @@ error:
* mp - ubiquitous xfs mount point structure
*/
int
-xfs_log_mount_finish(xfs_mount_t *mp, int mfsi_flags)
+xfs_log_mount_finish(xfs_mount_t *mp)
{
int error;
if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
- error = xlog_recover_finish(mp->m_log, mfsi_flags);
+ error = xlog_recover_finish(mp->m_log);
else {
error = 0;
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
@@ -703,7 +732,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
iclog->ic_state == XLOG_STATE_DIRTY)) {
if (!XLOG_FORCED_SHUTDOWN(log)) {
- sv_wait(&iclog->ic_forcesema, PMEM,
+ sv_wait(&iclog->ic_force_wait, PMEM,
&log->l_icloglock, s);
} else {
spin_unlock(&log->l_icloglock);
@@ -744,7 +773,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
|| iclog->ic_state == XLOG_STATE_DIRTY
|| iclog->ic_state == XLOG_STATE_IOERROR) ) {
- sv_wait(&iclog->ic_forcesema, PMEM,
+ sv_wait(&iclog->ic_force_wait, PMEM,
&log->l_icloglock, s);
} else {
spin_unlock(&log->l_icloglock);
@@ -834,7 +863,7 @@ xfs_log_move_tail(xfs_mount_t *mp,
break;
tail_lsn = 0;
free_bytes -= tic->t_unit_res;
- sv_signal(&tic->t_sema);
+ sv_signal(&tic->t_wait);
tic = tic->t_next;
} while (tic != log->l_write_headq);
}
@@ -855,7 +884,7 @@ xfs_log_move_tail(xfs_mount_t *mp,
break;
tail_lsn = 0;
free_bytes -= need_bytes;
- sv_signal(&tic->t_sema);
+ sv_signal(&tic->t_wait);
tic = tic->t_next;
} while (tic != log->l_reserve_headq);
}
@@ -1004,11 +1033,12 @@ xlog_iodone(xfs_buf_t *bp)
l = iclog->ic_log;
/*
- * If the ordered flag has been removed by a lower
- * layer, it means the underlyin device no longer supports
+ * If the _XFS_BARRIER_FAILED flag was set by a lower
+ * layer, it means the underlying device no longer supports
* barrier I/O. Warn loudly and turn off barriers.
*/
- if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) {
+ if (bp->b_flags & _XFS_BARRIER_FAILED) {
+ bp->b_flags &= ~_XFS_BARRIER_FAILED;
l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
xfs_fs_cmn_err(CE_WARN, l->l_mp,
"xlog_iodone: Barriers are no longer supported"
@@ -1228,8 +1258,9 @@ xlog_alloc_log(xfs_mount_t *mp,
spin_lock_init(&log->l_icloglock);
spin_lock_init(&log->l_grant_lock);
- initnsema(&log->l_flushsema, 0, "ic-flush");
+ sv_init(&log->l_flush_wait, 0, "flush_wait");
+ xlog_trace_loggrant_alloc(log);
/* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1281,8 +1312,10 @@ xlog_alloc_log(xfs_mount_t *mp,
ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
- sv_init(&iclog->ic_forcesema, SV_DEFAULT, "iclog-force");
- sv_init(&iclog->ic_writesema, SV_DEFAULT, "iclog-write");
+ sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force");
+ sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write");
+
+ xlog_trace_iclog_alloc(iclog);
iclogp = &iclog->ic_next;
}
@@ -1561,33 +1594,21 @@ xlog_dealloc_log(xlog_t *log)
iclog = log->l_iclog;
for (i=0; i<log->l_iclog_bufs; i++) {
- sv_destroy(&iclog->ic_forcesema);
- sv_destroy(&iclog->ic_writesema);
+ sv_destroy(&iclog->ic_force_wait);
+ sv_destroy(&iclog->ic_write_wait);
xfs_buf_free(iclog->ic_bp);
-#ifdef XFS_LOG_TRACE
- if (iclog->ic_trace != NULL) {
- ktrace_free(iclog->ic_trace);
- }
-#endif
+ xlog_trace_iclog_dealloc(iclog);
next_iclog = iclog->ic_next;
- kmem_free(iclog, sizeof(xlog_in_core_t));
+ kmem_free(iclog);
iclog = next_iclog;
}
- freesema(&log->l_flushsema);
spinlock_destroy(&log->l_icloglock);
spinlock_destroy(&log->l_grant_lock);
xfs_buf_free(log->l_xbuf);
-#ifdef XFS_LOG_TRACE
- if (log->l_trace != NULL) {
- ktrace_free(log->l_trace);
- }
- if (log->l_grant_trace != NULL) {
- ktrace_free(log->l_grant_trace);
- }
-#endif
+ xlog_trace_loggrant_dealloc(log);
log->l_mp->m_log = NULL;
- kmem_free(log, sizeof(xlog_t));
+ kmem_free(log);
} /* xlog_dealloc_log */
/*
@@ -1973,7 +1994,7 @@ xlog_write(xfs_mount_t * mp,
/* Clean iclogs starting from the head. This ordering must be
* maintained, so an iclog doesn't become ACTIVE beyond one that
* is SYNCING. This is also required to maintain the notion that we use
- * a counting semaphore to hold off would be writers to the log when every
+ * a ordered wait queue to hold off would be writers to the log when every
* iclog is trying to sync to disk.
*
* State Change: DIRTY -> ACTIVE
@@ -2097,6 +2118,7 @@ xlog_state_do_callback(
int funcdidcallbacks; /* flag: function did callbacks */
int repeats; /* for issuing console warnings if
* looping too many times */
+ int wake = 0;
spin_lock(&log->l_icloglock);
first_iclog = iclog = log->l_iclog;
@@ -2236,7 +2258,7 @@ xlog_state_do_callback(
xlog_state_clean_log(log);
/* wake up threads waiting in xfs_log_force() */
- sv_broadcast(&iclog->ic_forcesema);
+ sv_broadcast(&iclog->ic_force_wait);
iclog = iclog->ic_next;
} while (first_iclog != iclog);
@@ -2278,15 +2300,13 @@ xlog_state_do_callback(
}
#endif
- flushcnt = 0;
- if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR)) {
- flushcnt = log->l_flushcnt;
- log->l_flushcnt = 0;
- }
+ if (log->l_iclog->ic_state & (XLOG_STATE_ACTIVE|XLOG_STATE_IOERROR))
+ wake = 1;
spin_unlock(&log->l_icloglock);
- while (flushcnt--)
- vsema(&log->l_flushsema);
-} /* xlog_state_do_callback */
+
+ if (wake)
+ sv_broadcast(&log->l_flush_wait);
+}
/*
@@ -2300,8 +2320,7 @@ xlog_state_do_callback(
* the second completion goes through.
*
* Callbacks could take time, so they are done outside the scope of the
- * global state machine log lock. Assume that the calls to cvsema won't
- * take a long time. At least we know it won't sleep.
+ * global state machine log lock.
*/
STATIC void
xlog_state_done_syncing(
@@ -2337,7 +2356,7 @@ xlog_state_done_syncing(
* iclog buffer, we wake them all, one will get to do the
* I/O, the others get to wait for the result.
*/
- sv_broadcast(&iclog->ic_writesema);
+ sv_broadcast(&iclog->ic_write_wait);
spin_unlock(&log->l_icloglock);
xlog_state_do_callback(log, aborted, iclog); /* also cleans log */
} /* xlog_state_done_syncing */
@@ -2345,11 +2364,9 @@ xlog_state_done_syncing(
/*
* If the head of the in-core log ring is not (ACTIVE or DIRTY), then we must
- * sleep. The flush semaphore is set to the number of in-core buffers and
- * decremented around disk syncing. Therefore, if all buffers are syncing,
- * this semaphore will cause new writes to sleep until a sync completes.
- * Otherwise, this code just does p() followed by v(). This approximates
- * a sleep/wakeup except we can't race.
+ * sleep. We wait on the flush queue on the head iclog as that should be
+ * the first iclog to complete flushing. Hence if all iclogs are syncing,
+ * we will wait here and all new writes will sleep until a sync completes.
*
* The in-core logs are used in a circular fashion. They are not used
* out-of-order even when an iclog past the head is free.
@@ -2384,16 +2401,15 @@ restart:
}
iclog = log->l_iclog;
- if (! (iclog->ic_state == XLOG_STATE_ACTIVE)) {
- log->l_flushcnt++;
- spin_unlock(&log->l_icloglock);
+ if (iclog->ic_state != XLOG_STATE_ACTIVE) {
xlog_trace_iclog(iclog, XLOG_TRACE_SLEEP_FLUSH);
XFS_STATS_INC(xs_log_noiclogs);
- /* Ensure that log writes happen */
- psema(&log->l_flushsema, PINOD);
+
+ /* Wait for log writes to have flushed */
+ sv_wait(&log->l_flush_wait, 0, &log->l_icloglock, 0);
goto restart;
}
- ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
+
head = &iclog->ic_header;
atomic_inc(&iclog->ic_refcnt); /* prevents sync */
@@ -2427,13 +2443,20 @@ restart:
if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
- /* If I'm the only one writing to this iclog, sync it to disk */
- if (atomic_read(&iclog->ic_refcnt) == 1) {
+ /*
+ * If I'm the only one writing to this iclog, sync it to disk.
+ * We need to do an atomic compare and decrement here to avoid
+ * racing with concurrent atomic_dec_and_lock() calls in
+ * xlog_state_release_iclog() when there is more than one
+ * reference to the iclog.
+ */
+ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
+ /* we are the only one */
spin_unlock(&log->l_icloglock);
- if ((error = xlog_state_release_iclog(log, iclog)))
+ error = xlog_state_release_iclog(log, iclog);
+ if (error)
return error;
} else {
- atomic_dec(&iclog->ic_refcnt);
spin_unlock(&log->l_icloglock);
}
goto restart;
@@ -2500,7 +2523,7 @@ xlog_grant_log_space(xlog_t *log,
goto error_return;
XFS_STATS_INC(xs_sleep_logspace);
- sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+ sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
/*
* If we got an error, and the filesystem is shutting down,
* we'll catch it down below. So just continue...
@@ -2526,7 +2549,7 @@ redo:
xlog_trace_loggrant(log, tic,
"xlog_grant_log_space: sleep 2");
XFS_STATS_INC(xs_sleep_logspace);
- sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+ sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
if (XLOG_FORCED_SHUTDOWN(log)) {
spin_lock(&log->l_grant_lock);
@@ -2625,7 +2648,7 @@ xlog_regrant_write_log_space(xlog_t *log,
if (free_bytes < ntic->t_unit_res)
break;
free_bytes -= ntic->t_unit_res;
- sv_signal(&ntic->t_sema);
+ sv_signal(&ntic->t_wait);
ntic = ntic->t_next;
} while (ntic != log->l_write_headq);
@@ -2636,7 +2659,7 @@ xlog_regrant_write_log_space(xlog_t *log,
xlog_trace_loggrant(log, tic,
"xlog_regrant_write_log_space: sleep 1");
XFS_STATS_INC(xs_sleep_logspace);
- sv_wait(&tic->t_sema, PINOD|PLTWAIT,
+ sv_wait(&tic->t_wait, PINOD|PLTWAIT,
&log->l_grant_lock, s);
/* If we're shutting down, this tic is already
@@ -2665,7 +2688,7 @@ redo:
if ((tic->t_flags & XLOG_TIC_IN_Q) == 0)
xlog_ins_ticketq(&log->l_write_headq, tic);
XFS_STATS_INC(xs_sleep_logspace);
- sv_wait(&tic->t_sema, PINOD|PLTWAIT, &log->l_grant_lock, s);
+ sv_wait(&tic->t_wait, PINOD|PLTWAIT, &log->l_grant_lock, s);
/* If we're shutting down, this tic is already off the queue */
if (XLOG_FORCED_SHUTDOWN(log)) {
@@ -2908,7 +2931,7 @@ xlog_state_switch_iclogs(xlog_t *log,
* 2. the current iclog is drity, and the previous iclog is in the
* active or dirty state.
*
- * We may sleep (call psema) if:
+ * We may sleep if:
*
* 1. the current iclog is not in the active nor dirty state.
* 2. the current iclog dirty, and the previous iclog is not in the
@@ -3005,7 +3028,7 @@ maybe_sleep:
return XFS_ERROR(EIO);
}
XFS_STATS_INC(xs_log_force_sleep);
- sv_wait(&iclog->ic_forcesema, PINOD, &log->l_icloglock, s);
+ sv_wait(&iclog->ic_force_wait, PINOD, &log->l_icloglock, s);
/*
* No need to grab the log lock here since we're
* only deciding whether or not to return EIO
@@ -3088,7 +3111,7 @@ try_again:
XLOG_STATE_SYNCING))) {
ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
XFS_STATS_INC(xs_log_force_sleep);
- sv_wait(&iclog->ic_prev->ic_writesema, PSWP,
+ sv_wait(&iclog->ic_prev->ic_write_wait, PSWP,
&log->l_icloglock, s);
*log_flushed = 1;
already_slept = 1;
@@ -3108,7 +3131,7 @@ try_again:
!(iclog->ic_state & (XLOG_STATE_ACTIVE | XLOG_STATE_DIRTY))) {
/*
- * Don't wait on the forcesema if we know that we've
+ * Don't wait on completion if we know that we've
* gotten a log write error.
*/
if (iclog->ic_state & XLOG_STATE_IOERROR) {
@@ -3116,7 +3139,7 @@ try_again:
return XFS_ERROR(EIO);
}
XFS_STATS_INC(xs_log_force_sleep);
- sv_wait(&iclog->ic_forcesema, PSWP, &log->l_icloglock, s);
+ sv_wait(&iclog->ic_force_wait, PSWP, &log->l_icloglock, s);
/*
* No need to grab the log lock here since we're
* only deciding whether or not to return EIO
@@ -3172,7 +3195,7 @@ STATIC void
xlog_ticket_put(xlog_t *log,
xlog_ticket_t *ticket)
{
- sv_destroy(&ticket->t_sema);
+ sv_destroy(&ticket->t_wait);
kmem_zone_free(xfs_log_ticket_zone, ticket);
} /* xlog_ticket_put */
@@ -3262,7 +3285,7 @@ xlog_ticket_get(xlog_t *log,
tic->t_trans_type = 0;
if (xflags & XFS_LOG_PERM_RESERV)
tic->t_flags |= XLOG_TIC_PERM_RESERV;
- sv_init(&(tic->t_sema), SV_DEFAULT, "logtick");
+ sv_init(&(tic->t_wait), SV_DEFAULT, "logtick");
xlog_tic_reset_res(tic);
@@ -3549,14 +3572,14 @@ xfs_log_force_umount(
*/
if ((tic = log->l_reserve_headq)) {
do {
- sv_signal(&tic->t_sema);
+ sv_signal(&tic->t_wait);
tic = tic->t_next;
} while (tic != log->l_reserve_headq);
}
if ((tic = log->l_write_headq)) {
do {
- sv_signal(&tic->t_sema);
+ sv_signal(&tic->t_wait);
tic = tic->t_next;
} while (tic != log->l_write_headq);
}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d1d678ecb63..d47b91f1082 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -149,7 +149,7 @@ int xfs_log_mount(struct xfs_mount *mp,
struct xfs_buftarg *log_target,
xfs_daddr_t start_block,
int num_bblocks);
-int xfs_log_mount_finish(struct xfs_mount *mp, int);
+int xfs_log_mount_finish(struct xfs_mount *mp);
void xfs_log_move_tail(struct xfs_mount *mp,
xfs_lsn_t tail_lsn);
int xfs_log_notify(struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8952a392b5f..e7d8f84443f 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -241,7 +241,7 @@ typedef struct xlog_res {
} xlog_res_t;
typedef struct xlog_ticket {
- sv_t t_sema; /* sleep on this semaphore : 20 */
+ sv_t t_wait; /* ticket wait queue : 20 */
struct xlog_ticket *t_next; /* :4|8 */
struct xlog_ticket *t_prev; /* :4|8 */
xlog_tid_t t_tid; /* transaction identifier : 4 */
@@ -314,7 +314,7 @@ typedef struct xlog_rec_ext_header {
* xlog_rec_header_t into the reserved space.
* - ic_data follows, so a write to disk can start at the beginning of
* the iclog.
- * - ic_forcesema is used to implement synchronous forcing of the iclog to disk.
+ * - ic_forcewait is used to implement synchronous forcing of the iclog to disk.
* - ic_next is the pointer to the next iclog in the ring.
* - ic_bp is a pointer to the buffer used to write this incore log to disk.
* - ic_log is a pointer back to the global log structure.
@@ -339,8 +339,8 @@ typedef struct xlog_rec_ext_header {
* and move everything else out to subsequent cachelines.
*/
typedef struct xlog_iclog_fields {
- sv_t ic_forcesema;
- sv_t ic_writesema;
+ sv_t ic_force_wait;
+ sv_t ic_write_wait;
struct xlog_in_core *ic_next;
struct xlog_in_core *ic_prev;
struct xfs_buf *ic_bp;
@@ -377,8 +377,8 @@ typedef struct xlog_in_core {
/*
* Defines to save our code from this glop.
*/
-#define ic_forcesema hic_fields.ic_forcesema
-#define ic_writesema hic_fields.ic_writesema
+#define ic_force_wait hic_fields.ic_force_wait
+#define ic_write_wait hic_fields.ic_write_wait
#define ic_next hic_fields.ic_next
#define ic_prev hic_fields.ic_prev
#define ic_bp hic_fields.ic_bp
@@ -423,10 +423,8 @@ typedef struct log {
int l_logBBsize; /* size of log in BB chunks */
/* The following block of fields are changed while holding icloglock */
- sema_t l_flushsema ____cacheline_aligned_in_smp;
- /* iclog flushing semaphore */
- int l_flushcnt; /* # of procs waiting on this
- * sema */
+ sv_t l_flush_wait ____cacheline_aligned_in_smp;
+ /* waiting for iclog flush */
int l_covered_state;/* state of "covering disk
* log entries" */
xlog_in_core_t *l_iclog; /* head log queue */
@@ -450,7 +448,6 @@ typedef struct log {
int l_grant_write_bytes;
#ifdef XFS_LOG_TRACE
- struct ktrace *l_trace;
struct ktrace *l_grant_trace;
#endif
@@ -470,7 +467,7 @@ extern int xlog_find_tail(xlog_t *log,
xfs_daddr_t *head_blk,
xfs_daddr_t *tail_blk);
extern int xlog_recover(xlog_t *log);
-extern int xlog_recover_finish(xlog_t *log, int mfsi_flags);
+extern int xlog_recover_finish(xlog_t *log);
extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
extern void xlog_recover_process_iunlinks(xlog_t *log);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index e65ab4af095..82d46ce69d5 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1715,8 +1715,7 @@ xlog_check_buffer_cancelled(
} else {
prevp->bc_next = bcp->bc_next;
}
- kmem_free(bcp,
- sizeof(xfs_buf_cancel_t));
+ kmem_free(bcp);
}
}
return 1;
@@ -2519,7 +2518,7 @@ write_inode_buffer:
error:
if (need_free)
- kmem_free(in_f, sizeof(*in_f));
+ kmem_free(in_f);
return XFS_ERROR(error);
}
@@ -2830,16 +2829,14 @@ xlog_recover_free_trans(
item = item->ri_next;
/* Free the regions in the item. */
for (i = 0; i < free_item->ri_cnt; i++) {
- kmem_free(free_item->ri_buf[i].i_addr,
- free_item->ri_buf[i].i_len);
+ kmem_free(free_item->ri_buf[i].i_addr);
}
/* Free the item itself */
- kmem_free(free_item->ri_buf,
- (free_item->ri_total * sizeof(xfs_log_iovec_t)));
- kmem_free(free_item, sizeof(xlog_recover_item_t));
+ kmem_free(free_item->ri_buf);
+ kmem_free(free_item);
} while (first_item != item);
/* Free the transaction recover structure */
- kmem_free(trans, sizeof(xlog_recover_t));
+ kmem_free(trans);
}
STATIC int
@@ -3786,8 +3783,7 @@ xlog_do_log_recovery(
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
XLOG_RECOVER_PASS1);
if (error != 0) {
- kmem_free(log->l_buf_cancel_table,
- XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
+ kmem_free(log->l_buf_cancel_table);
log->l_buf_cancel_table = NULL;
return error;
}
@@ -3806,8 +3802,7 @@ xlog_do_log_recovery(
}
#endif /* DEBUG */
- kmem_free(log->l_buf_cancel_table,
- XLOG_BC_TABLE_SIZE * sizeof(xfs_buf_cancel_t*));
+ kmem_free(log->l_buf_cancel_table);
log->l_buf_cancel_table = NULL;
return error;
@@ -3945,8 +3940,7 @@ xlog_recover(
*/
int
xlog_recover_finish(
- xlog_t *log,
- int mfsi_flags)
+ xlog_t *log)
{
/*
* Now we're ready to do the transactions needed for the
@@ -3974,9 +3968,7 @@ xlog_recover_finish(
xfs_log_force(log->l_mp, (xfs_lsn_t)0,
(XFS_LOG_FORCE | XFS_LOG_SYNC));
- if ( (mfsi_flags & XFS_MFSI_NOUNLINK) == 0 ) {
- xlog_recover_process_iunlinks(log);
- }
+ xlog_recover_process_iunlinks(log);
xlog_recover_check_summary(log);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index da3988453b7..a4503f5e949 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -47,12 +47,10 @@
STATIC int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
STATIC int xfs_uuid_mount(xfs_mount_t *);
-STATIC void xfs_uuid_unmount(xfs_mount_t *mp);
STATIC void xfs_unmountfs_wait(xfs_mount_t *);
#ifdef HAVE_PERCPU_SB
-STATIC void xfs_icsb_destroy_counters(xfs_mount_t *);
STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
int);
STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
@@ -63,7 +61,6 @@ STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
#else
-#define xfs_icsb_destroy_counters(mp) do { } while (0)
#define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
#define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0)
@@ -126,34 +123,12 @@ static const struct {
};
/*
- * Return a pointer to an initialized xfs_mount structure.
- */
-xfs_mount_t *
-xfs_mount_init(void)
-{
- xfs_mount_t *mp;
-
- mp = kmem_zalloc(sizeof(xfs_mount_t), KM_SLEEP);
-
- if (xfs_icsb_init_counters(mp)) {
- mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
- }
-
- spin_lock_init(&mp->m_sb_lock);
- mutex_init(&mp->m_ilock);
- mutex_init(&mp->m_growlock);
- atomic_set(&mp->m_active_trans, 0);
-
- return mp;
-}
-
-/*
* Free up the resources associated with a mount structure. Assume that
* the structure was initially zeroed, so we can tell which fields got
* initialized.
*/
-void
-xfs_mount_free(
+STATIC void
+xfs_free_perag(
xfs_mount_t *mp)
{
if (mp->m_perag) {
@@ -161,28 +136,9 @@ xfs_mount_free(
for (agno = 0; agno < mp->m_maxagi; agno++)
if (mp->m_perag[agno].pagb_list)
- kmem_free(mp->m_perag[agno].pagb_list,
- sizeof(xfs_perag_busy_t) *
- XFS_PAGB_NUM_SLOTS);
- kmem_free(mp->m_perag,
- sizeof(xfs_perag_t) * mp->m_sb.sb_agcount);
+ kmem_free(mp->m_perag[agno].pagb_list);
+ kmem_free(mp->m_perag);
}
-
- spinlock_destroy(&mp->m_ail_lock);
- spinlock_destroy(&mp->m_sb_lock);
- mutex_destroy(&mp->m_ilock);
- mutex_destroy(&mp->m_growlock);
- if (mp->m_quotainfo)
- XFS_QM_DONE(mp);
-
- if (mp->m_fsname != NULL)
- kmem_free(mp->m_fsname, mp->m_fsname_len);
- if (mp->m_rtname != NULL)
- kmem_free(mp->m_rtname, strlen(mp->m_rtname) + 1);
- if (mp->m_logname != NULL)
- kmem_free(mp->m_logname, strlen(mp->m_logname) + 1);
-
- xfs_icsb_destroy_counters(mp);
}
/*
@@ -288,6 +244,19 @@ xfs_mount_validate_sb(
return XFS_ERROR(EFSCORRUPTED);
}
+ /*
+ * Until this is fixed only page-sized or smaller data blocks work.
+ */
+ if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
+ xfs_fs_mount_cmn_err(flags,
+ "file system with blocksize %d bytes",
+ sbp->sb_blocksize);
+ xfs_fs_mount_cmn_err(flags,
+ "only pagesize (%ld) or less will currently work.",
+ PAGE_SIZE);
+ return XFS_ERROR(ENOSYS);
+ }
+
if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
xfs_fs_mount_cmn_err(flags,
@@ -309,19 +278,6 @@ xfs_mount_validate_sb(
return XFS_ERROR(ENOSYS);
}
- /*
- * Until this is fixed only page-sized or smaller data blocks work.
- */
- if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
- xfs_fs_mount_cmn_err(flags,
- "file system with blocksize %d bytes",
- sbp->sb_blocksize);
- xfs_fs_mount_cmn_err(flags,
- "only pagesize (%ld) or less will currently work.",
- PAGE_SIZE);
- return XFS_ERROR(ENOSYS);
- }
-
return 0;
}
@@ -734,11 +690,11 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
* Update alignment values based on mount options and sb values
*/
STATIC int
-xfs_update_alignment(xfs_mount_t *mp, int mfsi_flags, __uint64_t *update_flags)
+xfs_update_alignment(xfs_mount_t *mp, __uint64_t *update_flags)
{
xfs_sb_t *sbp = &(mp->m_sb);
- if (mp->m_dalign && !(mfsi_flags & XFS_MFSI_SECOND)) {
+ if (mp->m_dalign) {
/*
* If stripe unit and stripe width are not multiples
* of the fs blocksize turn off alignment.
@@ -894,7 +850,7 @@ xfs_set_inoalignment(xfs_mount_t *mp)
* Check that the data (and log if separate) are an ok size.
*/
STATIC int
-xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags)
+xfs_check_sizes(xfs_mount_t *mp)
{
xfs_buf_t *bp;
xfs_daddr_t d;
@@ -917,8 +873,7 @@ xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags)
return error;
}
- if (((mfsi_flags & XFS_MFSI_CLIENT) == 0) &&
- mp->m_logdev_targp != mp->m_ddev_targp) {
+ if (mp->m_logdev_targp != mp->m_ddev_targp) {
d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
cmn_err(CE_WARN, "XFS: size check 3 failed");
@@ -953,15 +908,13 @@ xfs_check_sizes(xfs_mount_t *mp, int mfsi_flags)
*/
int
xfs_mountfs(
- xfs_mount_t *mp,
- int mfsi_flags)
+ xfs_mount_t *mp)
{
xfs_sb_t *sbp = &(mp->m_sb);
xfs_inode_t *rip;
__uint64_t resblks;
__int64_t update_flags = 0LL;
uint quotamount, quotaflags;
- int agno;
int uuid_mounted = 0;
int error = 0;
@@ -994,9 +947,19 @@ xfs_mountfs(
* Re-check for ATTR2 in case it was found in bad_features2
* slot.
*/
- if (xfs_sb_version_hasattr2(&mp->m_sb))
+ if (xfs_sb_version_hasattr2(&mp->m_sb) &&
+ !(mp->m_flags & XFS_MOUNT_NOATTR2))
mp->m_flags |= XFS_MOUNT_ATTR2;
+ }
+
+ if (xfs_sb_version_hasattr2(&mp->m_sb) &&
+ (mp->m_flags & XFS_MOUNT_NOATTR2)) {
+ xfs_sb_version_removeattr2(&mp->m_sb);
+ update_flags |= XFS_SB_FEATURES2;
+ /* update sb_versionnum for the clearing of the morebits */
+ if (!sbp->sb_features2)
+ update_flags |= XFS_SB_VERSIONNUM;
}
/*
@@ -1005,7 +968,7 @@ xfs_mountfs(
* allocator alignment is within an ag, therefore ag has
* to be aligned at stripe boundary.
*/
- error = xfs_update_alignment(mp, mfsi_flags, &update_flags);
+ error = xfs_update_alignment(mp, &update_flags);
if (error)
goto error1;
@@ -1024,8 +987,7 @@ xfs_mountfs(
* since a single partition filesystem is identical to a single
* partition volume/filesystem.
*/
- if ((mfsi_flags & XFS_MFSI_SECOND) == 0 &&
- (mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
+ if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0) {
if (xfs_uuid_mount(mp)) {
error = XFS_ERROR(EINVAL);
goto error1;
@@ -1053,7 +1015,7 @@ xfs_mountfs(
/*
* Check that the data (and log if separate) are an ok size.
*/
- error = xfs_check_sizes(mp, mfsi_flags);
+ error = xfs_check_sizes(mp);
if (error)
goto error1;
@@ -1067,13 +1029,6 @@ xfs_mountfs(
}
/*
- * For client case we are done now
- */
- if (mfsi_flags & XFS_MFSI_CLIENT) {
- return 0;
- }
-
- /*
* Copies the low order bits of the timestamp and the randomly
* set "sequence" number out of a UUID.
*/
@@ -1097,8 +1052,10 @@ xfs_mountfs(
* Allocate and initialize the per-ag data.
*/
init_rwsem(&mp->m_peraglock);
- mp->m_perag =
- kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), KM_SLEEP);
+ mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
+ KM_MAYFAIL);
+ if (!mp->m_perag)
+ goto error1;
mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
@@ -1210,7 +1167,7 @@ xfs_mountfs(
* delayed until after the root and real-time bitmap inodes
* were consistently read in.
*/
- error = xfs_log_mount_finish(mp, mfsi_flags);
+ error = xfs_log_mount_finish(mp);
if (error) {
cmn_err(CE_WARN, "XFS: log mount finish failed");
goto error4;
@@ -1219,7 +1176,7 @@ xfs_mountfs(
/*
* Complete the quota initialisation, post-log-replay component.
*/
- error = XFS_QM_MOUNT(mp, quotamount, quotaflags, mfsi_flags);
+ error = XFS_QM_MOUNT(mp, quotamount, quotaflags);
if (error)
goto error4;
@@ -1253,31 +1210,25 @@ xfs_mountfs(
error3:
xfs_log_unmount_dealloc(mp);
error2:
- for (agno = 0; agno < sbp->sb_agcount; agno++)
- if (mp->m_perag[agno].pagb_list)
- kmem_free(mp->m_perag[agno].pagb_list,
- sizeof(xfs_perag_busy_t) * XFS_PAGB_NUM_SLOTS);
- kmem_free(mp->m_perag, sbp->sb_agcount * sizeof(xfs_perag_t));
- mp->m_perag = NULL;
- /* FALLTHROUGH */
+ xfs_free_perag(mp);
error1:
if (uuid_mounted)
- xfs_uuid_unmount(mp);
- xfs_freesb(mp);
+ uuid_table_remove(&mp->m_sb.sb_uuid);
return error;
}
/*
- * xfs_unmountfs
- *
* This flushes out the inodes,dquots and the superblock, unmounts the
* log and makes sure that incore structures are freed.
*/
-int
-xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
+void
+xfs_unmountfs(
+ struct xfs_mount *mp)
{
- __uint64_t resblks;
- int error = 0;
+ __uint64_t resblks;
+ int error;
+
+ IRELE(mp->m_rootip);
/*
* We can potentially deadlock here if we have an inode cluster
@@ -1334,32 +1285,20 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
xfs_unmountfs_wait(mp); /* wait for async bufs */
xfs_log_unmount(mp); /* Done! No more fs ops. */
- xfs_freesb(mp);
-
/*
* All inodes from this mount point should be freed.
*/
ASSERT(mp->m_inodes == NULL);
- xfs_unmountfs_close(mp, cr);
if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
- xfs_uuid_unmount(mp);
+ uuid_table_remove(&mp->m_sb.sb_uuid);
-#if defined(DEBUG) || defined(INDUCE_IO_ERROR)
+#if defined(DEBUG)
xfs_errortag_clearall(mp, 0);
#endif
- xfs_mount_free(mp);
- return 0;
-}
-
-void
-xfs_unmountfs_close(xfs_mount_t *mp, struct cred *cr)
-{
- if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
- xfs_free_buftarg(mp->m_logdev_targp, 1);
- if (mp->m_rtdev_targp)
- xfs_free_buftarg(mp->m_rtdev_targp, 1);
- xfs_free_buftarg(mp->m_ddev_targp, 0);
+ xfs_free_perag(mp);
+ if (mp->m_quotainfo)
+ XFS_QM_DONE(mp);
}
STATIC void
@@ -1905,16 +1844,6 @@ xfs_uuid_mount(
}
/*
- * Remove filesystem from the UUID table.
- */
-STATIC void
-xfs_uuid_unmount(
- xfs_mount_t *mp)
-{
- uuid_table_remove(&mp->m_sb.sb_uuid);
-}
-
-/*
* Used to log changes to the superblock unit and width fields which could
* be altered by the mount options, as well as any potential sb_features2
* fixup. Only the first superblock is updated.
@@ -1928,7 +1857,8 @@ xfs_mount_log_sb(
int error;
ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
- XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2));
+ XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 |
+ XFS_SB_VERSIONNUM));
tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
@@ -2109,7 +2039,7 @@ xfs_icsb_reinit_counters(
xfs_icsb_unlock(mp);
}
-STATIC void
+void
xfs_icsb_destroy_counters(
xfs_mount_t *mp)
{
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 63e0693a358..f3c1024b124 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -61,6 +61,7 @@ struct xfs_bmap_free;
struct xfs_extdelta;
struct xfs_swapext;
struct xfs_mru_cache;
+struct xfs_nameops;
/*
* Prototypes and functions for the Data Migration subsystem.
@@ -113,7 +114,7 @@ struct xfs_dqtrxops;
struct xfs_quotainfo;
typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
-typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint, int);
+typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
typedef int (*xfs_qmunmount_t)(struct xfs_mount *);
typedef void (*xfs_qmdone_t)(struct xfs_mount *);
typedef void (*xfs_dqrele_t)(struct xfs_dquot *);
@@ -157,8 +158,8 @@ typedef struct xfs_qmops {
#define XFS_QM_INIT(mp, mnt, fl) \
(*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl)
-#define XFS_QM_MOUNT(mp, mnt, fl, mfsi_flags) \
- (*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl, mfsi_flags)
+#define XFS_QM_MOUNT(mp, mnt, fl) \
+ (*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl)
#define XFS_QM_UNMOUNT(mp) \
(*(mp)->m_qm_ops->xfs_qmunmount)(mp)
#define XFS_QM_DONE(mp) \
@@ -210,12 +211,14 @@ typedef struct xfs_icsb_cnts {
extern int xfs_icsb_init_counters(struct xfs_mount *);
extern void xfs_icsb_reinit_counters(struct xfs_mount *);
+extern void xfs_icsb_destroy_counters(struct xfs_mount *);
extern void xfs_icsb_sync_counters(struct xfs_mount *, int);
extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
#else
-#define xfs_icsb_init_counters(mp) (0)
-#define xfs_icsb_reinit_counters(mp) do { } while (0)
+#define xfs_icsb_init_counters(mp) (0)
+#define xfs_icsb_destroy_counters(mp) do { } while (0)
+#define xfs_icsb_reinit_counters(mp) do { } while (0)
#define xfs_icsb_sync_counters(mp, flags) do { } while (0)
#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
#endif
@@ -313,6 +316,7 @@ typedef struct xfs_mount {
__uint8_t m_inode_quiesce;/* call quiesce on new inodes.
field governed by m_ilock */
__uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
+ const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
int m_dirblksize; /* directory block sz--bytes */
int m_dirblkfsbs; /* directory block sz--fsbs */
xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */
@@ -378,6 +382,7 @@ typedef struct xfs_mount {
counters */
#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams
allocator */
+#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */
/*
@@ -437,13 +442,6 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
/*
* Flags for xfs_mountfs
*/
-#define XFS_MFSI_SECOND 0x01 /* Secondary mount -- skip stuff */
-#define XFS_MFSI_CLIENT 0x02 /* Is a client -- skip lots of stuff */
-/* XFS_MFSI_RRINODES */
-#define XFS_MFSI_NOUNLINK 0x08 /* Skip unlinked inode processing in */
- /* log recovery */
-#define XFS_MFSI_NO_QUOTACHECK 0x10 /* Skip quotacheck processing */
-/* XFS_MFSI_CONVERT_SUNIT */
#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */
#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d)
@@ -510,15 +508,12 @@ typedef struct xfs_mod_sb {
#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock))
#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock))
-extern xfs_mount_t *xfs_mount_init(void);
extern void xfs_mod_sb(xfs_trans_t *, __int64_t);
extern int xfs_log_sbcount(xfs_mount_t *, uint);
-extern void xfs_mount_free(xfs_mount_t *mp);
-extern int xfs_mountfs(xfs_mount_t *mp, int);
+extern int xfs_mountfs(xfs_mount_t *mp);
extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
-extern int xfs_unmountfs(xfs_mount_t *, struct cred *);
-extern void xfs_unmountfs_close(xfs_mount_t *, struct cred *);
+extern void xfs_unmountfs(xfs_mount_t *);
extern int xfs_unmountfs_writesb(xfs_mount_t *);
extern int xfs_unmount_flush(xfs_mount_t *, int);
extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
@@ -544,9 +539,6 @@ extern void xfs_qmops_put(struct xfs_mount *);
extern struct xfs_dmops xfs_dmcore_xfs;
-extern int xfs_init(void);
-extern void xfs_cleanup(void);
-
#endif /* __KERNEL__ */
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index a0b2c0a2589..afee7eb2432 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -307,15 +307,18 @@ xfs_mru_cache_init(void)
xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t),
"xfs_mru_cache_elem");
if (!xfs_mru_elem_zone)
- return ENOMEM;
+ goto out;
xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache");
- if (!xfs_mru_reap_wq) {
- kmem_zone_destroy(xfs_mru_elem_zone);
- return ENOMEM;
- }
+ if (!xfs_mru_reap_wq)
+ goto out_destroy_mru_elem_zone;
return 0;
+
+ out_destroy_mru_elem_zone:
+ kmem_zone_destroy(xfs_mru_elem_zone);
+ out:
+ return -ENOMEM;
}
void
@@ -382,9 +385,9 @@ xfs_mru_cache_create(
exit:
if (err && mru && mru->lists)
- kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists));
+ kmem_free(mru->lists);
if (err && mru)
- kmem_free(mru, sizeof(*mru));
+ kmem_free(mru);
return err;
}
@@ -424,8 +427,8 @@ xfs_mru_cache_destroy(
xfs_mru_cache_flush(mru);
- kmem_free(mru->lists, mru->grp_count * sizeof(*mru->lists));
- kmem_free(mru, sizeof(*mru));
+ kmem_free(mru->lists);
+ kmem_free(mru);
}
/*
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index d8063e1ad29..d700dacdb10 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -336,22 +336,18 @@ xfs_rename(
ASSERT(error != EEXIST);
if (error)
goto abort_return;
- xfs_ichgtime(src_ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
- } else {
- /*
- * We always want to hit the ctime on the source inode.
- * We do it in the if clause above for the 'new_parent &&
- * src_is_directory' case, and here we get all the other
- * cases. This isn't strictly required by the standards
- * since the source inode isn't really being changed,
- * but old unix file systems did it and some incremental
- * backup programs won't work without it.
- */
- xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG);
}
/*
+ * We always want to hit the ctime on the source inode.
+ *
+ * This isn't strictly required by the standards since the source
+ * inode isn't really being changed, but old unix file systems did
+ * it and some incremental backup programs won't work without it.
+ */
+ xfs_ichgtime(src_ip, XFS_ICHGTIME_CHG);
+
+ /*
* Adjust the link count on src_dp. This is necessary when
* renaming a directory, either within one parent when
* the target existed, or across two parent directories.
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index a0dc6e5bc5b..e2f68de1615 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -74,18 +74,6 @@ STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int,
*/
/*
- * xfs_lowbit32: get low bit set out of 32-bit argument, -1 if none set.
- */
-STATIC int
-xfs_lowbit32(
- __uint32_t v)
-{
- if (v)
- return ffs(v) - 1;
- return -1;
-}
-
-/*
* Allocate space to the bitmap or summary file, and zero it, for growfs.
*/
STATIC int /* error */
@@ -450,6 +438,7 @@ xfs_rtallocate_extent_near(
}
bbno = XFS_BITTOBLOCK(mp, bno);
i = 0;
+ ASSERT(minlen != 0);
log2len = xfs_highbit32(minlen);
/*
* Loop over all bitmap blocks (bbno + i is current block).
@@ -618,6 +607,8 @@ xfs_rtallocate_extent_size(
xfs_suminfo_t sum; /* summary information for extents */
ASSERT(minlen % prod == 0 && maxlen % prod == 0);
+ ASSERT(maxlen != 0);
+
/*
* Loop over all the levels starting with maxlen.
* At each level, look at all the bitmap blocks, to see if there
@@ -675,6 +666,9 @@ xfs_rtallocate_extent_size(
*rtblock = NULLRTBLOCK;
return 0;
}
+ ASSERT(minlen != 0);
+ ASSERT(maxlen != 0);
+
/*
* Loop over sizes, from maxlen down to minlen.
* This time, when we do the allocations, allow smaller ones
@@ -1961,6 +1955,7 @@ xfs_growfs_rt(
nsbp->sb_blocksize * nsbp->sb_rextsize);
nsbp->sb_rextents = nsbp->sb_rblocks;
do_div(nsbp->sb_rextents, nsbp->sb_rextsize);
+ ASSERT(nsbp->sb_rextents != 0);
nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
nrsumsize =
@@ -2062,7 +2057,7 @@ xfs_growfs_rt(
/*
* Free the fake mp structure.
*/
- kmem_free(nmp, sizeof(*nmp));
+ kmem_free(nmp);
return error;
}
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index b0f31c09a76..3a82576dde9 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -314,7 +314,7 @@ xfs_bioerror_relse(
* ASYNC buffers.
*/
XFS_BUF_ERROR(bp, EIO);
- XFS_BUF_V_IODONESEMA(bp);
+ XFS_BUF_FINISH_IOWAIT(bp);
} else {
xfs_buf_relse(bp);
}
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index d904efe7f87..3f8cf1587f4 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -46,10 +46,12 @@ struct xfs_mount;
#define XFS_SB_VERSION_SECTORBIT 0x0800
#define XFS_SB_VERSION_EXTFLGBIT 0x1000
#define XFS_SB_VERSION_DIRV2BIT 0x2000
+#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */
#define XFS_SB_VERSION_MOREBITSBIT 0x8000
#define XFS_SB_VERSION_OKSASHFBITS \
(XFS_SB_VERSION_EXTFLGBIT | \
- XFS_SB_VERSION_DIRV2BIT)
+ XFS_SB_VERSION_DIRV2BIT | \
+ XFS_SB_VERSION_BORGBIT)
#define XFS_SB_VERSION_OKREALFBITS \
(XFS_SB_VERSION_ATTRBIT | \
XFS_SB_VERSION_NLINKBIT | \
@@ -437,6 +439,12 @@ static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
}
+static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp)
+{
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+ (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
+}
+
static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
{
return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
@@ -473,6 +481,13 @@ static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
((sbp)->sb_features2 | XFS_SB_VERSION2_ATTR2BIT)));
}
+static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
+{
+ sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
+ if (!sbp->sb_features2)
+ sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
+}
+
/*
* end of superblock version macros
*/
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 140386434aa..4e1c22a23be 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -43,6 +43,7 @@
#include "xfs_quota.h"
#include "xfs_trans_priv.h"
#include "xfs_trans_space.h"
+#include "xfs_inode_item.h"
STATIC void xfs_trans_apply_sb_deltas(xfs_trans_t *);
@@ -253,7 +254,7 @@ _xfs_trans_alloc(
tp->t_mountp = mp;
tp->t_items_free = XFS_LIC_NUM_SLOTS;
tp->t_busy_free = XFS_LBC_NUM_SLOTS;
- XFS_LIC_INIT(&(tp->t_items));
+ xfs_lic_init(&(tp->t_items));
XFS_LBC_INIT(&(tp->t_busy));
return tp;
}
@@ -282,7 +283,7 @@ xfs_trans_dup(
ntp->t_mountp = tp->t_mountp;
ntp->t_items_free = XFS_LIC_NUM_SLOTS;
ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
- XFS_LIC_INIT(&(ntp->t_items));
+ xfs_lic_init(&(ntp->t_items));
XFS_LBC_INIT(&(ntp->t_busy));
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -889,7 +890,7 @@ shut_us_down:
tp->t_commit_lsn = commit_lsn;
if (nvec > XFS_TRANS_LOGVEC_COUNT) {
- kmem_free(log_vector, nvec * sizeof(xfs_log_iovec_t));
+ kmem_free(log_vector);
}
/*
@@ -1169,7 +1170,7 @@ xfs_trans_cancel(
while (licp != NULL) {
lidp = licp->lic_descs;
for (i = 0; i < licp->lic_unused; i++, lidp++) {
- if (XFS_LIC_ISFREE(licp, i)) {
+ if (xfs_lic_isfree(licp, i)) {
continue;
}
@@ -1216,6 +1217,68 @@ xfs_trans_free(
kmem_zone_free(xfs_trans_zone, tp);
}
+/*
+ * Roll from one trans in the sequence of PERMANENT transactions to
+ * the next: permanent transactions are only flushed out when
+ * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon
+ * as possible to let chunks of it go to the log. So we commit the
+ * chunk we've been working on and get a new transaction to continue.
+ */
+int
+xfs_trans_roll(
+ struct xfs_trans **tpp,
+ struct xfs_inode *dp)
+{
+ struct xfs_trans *trans;
+ unsigned int logres, count;
+ int error;
+
+ /*
+ * Ensure that the inode is always logged.
+ */
+ trans = *tpp;
+ xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
+
+ /*
+ * Copy the critical parameters from one trans to the next.
+ */
+ logres = trans->t_log_res;
+ count = trans->t_log_count;
+ *tpp = xfs_trans_dup(trans);
+
+ /*
+ * Commit the current transaction.
+ * If this commit failed, then it'd just unlock those items that
+ * are not marked ihold. That also means that a filesystem shutdown
+ * is in progress. The caller takes the responsibility to cancel
+ * the duplicate transaction that gets returned.
+ */
+ error = xfs_trans_commit(trans, 0);
+ if (error)
+ return (error);
+
+ trans = *tpp;
+
+ /*
+ * Reserve space in the log for th next transaction.
+ * This also pushes items in the "AIL", the list of logged items,
+ * out to disk if they are taking up space at the tail of the log
+ * that we want to use. This requires that either nothing be locked
+ * across this call, or that anything that is locked be logged in
+ * the prior and the next transactions.
+ */
+ error = xfs_trans_reserve(trans, 0, logres, 0,
+ XFS_TRANS_PERM_LOG_RES, count);
+ /*
+ * Ensure that the inode is in the new transaction and locked.
+ */
+ if (error)
+ return error;
+
+ xfs_trans_ijoin(trans, dp, XFS_ILOCK_EXCL);
+ xfs_trans_ihold(trans, dp);
+ return 0;
+}
/*
* THIS SHOULD BE REWRITTEN TO USE xfs_trans_next_item().
@@ -1253,7 +1316,7 @@ xfs_trans_committed(
* Special case the chunk embedded in the transaction.
*/
licp = &(tp->t_items);
- if (!(XFS_LIC_ARE_ALL_FREE(licp))) {
+ if (!(xfs_lic_are_all_free(licp))) {
xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
}
@@ -1262,10 +1325,10 @@ xfs_trans_committed(
*/
licp = licp->lic_next;
while (licp != NULL) {
- ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+ ASSERT(!xfs_lic_are_all_free(licp));
xfs_trans_chunk_committed(licp, tp->t_lsn, abortflag);
next_licp = licp->lic_next;
- kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+ kmem_free(licp);
licp = next_licp;
}
@@ -1325,7 +1388,7 @@ xfs_trans_chunk_committed(
lidp = licp->lic_descs;
for (i = 0; i < licp->lic_unused; i++, lidp++) {
- if (XFS_LIC_ISFREE(licp, i)) {
+ if (xfs_lic_isfree(licp, i)) {
continue;
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 0804207c739..74c80bd2b0e 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -210,62 +210,52 @@ typedef struct xfs_log_item_chunk {
* lic_unused to the right value (0 matches all free). The
* lic_descs.lid_index values are set up as each desc is allocated.
*/
-#define XFS_LIC_INIT(cp) xfs_lic_init(cp)
static inline void xfs_lic_init(xfs_log_item_chunk_t *cp)
{
cp->lic_free = XFS_LIC_FREEMASK;
}
-#define XFS_LIC_INIT_SLOT(cp,slot) xfs_lic_init_slot(cp, slot)
static inline void xfs_lic_init_slot(xfs_log_item_chunk_t *cp, int slot)
{
cp->lic_descs[slot].lid_index = (unsigned char)(slot);
}
-#define XFS_LIC_VACANCY(cp) xfs_lic_vacancy(cp)
static inline int xfs_lic_vacancy(xfs_log_item_chunk_t *cp)
{
return cp->lic_free & XFS_LIC_FREEMASK;
}
-#define XFS_LIC_ALL_FREE(cp) xfs_lic_all_free(cp)
static inline void xfs_lic_all_free(xfs_log_item_chunk_t *cp)
{
cp->lic_free = XFS_LIC_FREEMASK;
}
-#define XFS_LIC_ARE_ALL_FREE(cp) xfs_lic_are_all_free(cp)
static inline int xfs_lic_are_all_free(xfs_log_item_chunk_t *cp)
{
return ((cp->lic_free & XFS_LIC_FREEMASK) == XFS_LIC_FREEMASK);
}
-#define XFS_LIC_ISFREE(cp,slot) xfs_lic_isfree(cp,slot)
static inline int xfs_lic_isfree(xfs_log_item_chunk_t *cp, int slot)
{
return (cp->lic_free & (1 << slot));
}
-#define XFS_LIC_CLAIM(cp,slot) xfs_lic_claim(cp,slot)
static inline void xfs_lic_claim(xfs_log_item_chunk_t *cp, int slot)
{
cp->lic_free &= ~(1 << slot);
}
-#define XFS_LIC_RELSE(cp,slot) xfs_lic_relse(cp,slot)
static inline void xfs_lic_relse(xfs_log_item_chunk_t *cp, int slot)
{
cp->lic_free |= 1 << slot;
}
-#define XFS_LIC_SLOT(cp,slot) xfs_lic_slot(cp,slot)
static inline xfs_log_item_desc_t *
xfs_lic_slot(xfs_log_item_chunk_t *cp, int slot)
{
return &(cp->lic_descs[slot]);
}
-#define XFS_LIC_DESC_TO_SLOT(dp) xfs_lic_desc_to_slot(dp)
static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
{
return (uint)dp->lid_index;
@@ -278,7 +268,6 @@ static inline int xfs_lic_desc_to_slot(xfs_log_item_desc_t *dp)
* All of this yields the address of the chunk, which is
* cast to a chunk pointer.
*/
-#define XFS_LIC_DESC_TO_CHUNK(dp) xfs_lic_desc_to_chunk(dp)
static inline xfs_log_item_chunk_t *
xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
{
@@ -986,6 +975,7 @@ int _xfs_trans_commit(xfs_trans_t *,
int *);
#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL)
void xfs_trans_cancel(xfs_trans_t *, int);
+int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
int xfs_trans_ail_init(struct xfs_mount *);
void xfs_trans_ail_destroy(struct xfs_mount *);
void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index cb0c5839154..4e855b5ced6 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -1021,16 +1021,16 @@ xfs_trans_buf_item_match(
bp = NULL;
len = BBTOB(len);
licp = &tp->t_items;
- if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+ if (!xfs_lic_are_all_free(licp)) {
for (i = 0; i < licp->lic_unused; i++) {
/*
* Skip unoccupied slots.
*/
- if (XFS_LIC_ISFREE(licp, i)) {
+ if (xfs_lic_isfree(licp, i)) {
continue;
}
- lidp = XFS_LIC_SLOT(licp, i);
+ lidp = xfs_lic_slot(licp, i);
blip = (xfs_buf_log_item_t *)lidp->lid_item;
if (blip->bli_item.li_type != XFS_LI_BUF) {
continue;
@@ -1074,7 +1074,7 @@ xfs_trans_buf_item_match_all(
bp = NULL;
len = BBTOB(len);
for (licp = &tp->t_items; licp != NULL; licp = licp->lic_next) {
- if (XFS_LIC_ARE_ALL_FREE(licp)) {
+ if (xfs_lic_are_all_free(licp)) {
ASSERT(licp == &tp->t_items);
ASSERT(licp->lic_next == NULL);
return NULL;
@@ -1083,11 +1083,11 @@ xfs_trans_buf_item_match_all(
/*
* Skip unoccupied slots.
*/
- if (XFS_LIC_ISFREE(licp, i)) {
+ if (xfs_lic_isfree(licp, i)) {
continue;
}
- lidp = XFS_LIC_SLOT(licp, i);
+ lidp = xfs_lic_slot(licp, i);
blip = (xfs_buf_log_item_t *)lidp->lid_item;
if (blip->bli_item.li_type != XFS_LI_BUF) {
continue;
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 4c70bf5e998..2a1c0f071f9 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -291,7 +291,7 @@ xfs_trans_inode_broot_debug(
iip = ip->i_itemp;
if (iip->ili_root_size != 0) {
ASSERT(iip->ili_orig_root != NULL);
- kmem_free(iip->ili_orig_root, iip->ili_root_size);
+ kmem_free(iip->ili_orig_root);
iip->ili_root_size = 0;
iip->ili_orig_root = NULL;
}
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index 66a09f0d894..3c666e8317f 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -53,11 +53,11 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
* Initialize the chunk, and then
* claim the first slot in the newly allocated chunk.
*/
- XFS_LIC_INIT(licp);
- XFS_LIC_CLAIM(licp, 0);
+ xfs_lic_init(licp);
+ xfs_lic_claim(licp, 0);
licp->lic_unused = 1;
- XFS_LIC_INIT_SLOT(licp, 0);
- lidp = XFS_LIC_SLOT(licp, 0);
+ xfs_lic_init_slot(licp, 0);
+ lidp = xfs_lic_slot(licp, 0);
/*
* Link in the new chunk and update the free count.
@@ -88,14 +88,14 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
*/
licp = &tp->t_items;
while (licp != NULL) {
- if (XFS_LIC_VACANCY(licp)) {
+ if (xfs_lic_vacancy(licp)) {
if (licp->lic_unused <= XFS_LIC_MAX_SLOT) {
i = licp->lic_unused;
- ASSERT(XFS_LIC_ISFREE(licp, i));
+ ASSERT(xfs_lic_isfree(licp, i));
break;
}
for (i = 0; i <= XFS_LIC_MAX_SLOT; i++) {
- if (XFS_LIC_ISFREE(licp, i))
+ if (xfs_lic_isfree(licp, i))
break;
}
ASSERT(i <= XFS_LIC_MAX_SLOT);
@@ -108,12 +108,12 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
* If we find a free descriptor, claim it,
* initialize it, and return it.
*/
- XFS_LIC_CLAIM(licp, i);
+ xfs_lic_claim(licp, i);
if (licp->lic_unused <= i) {
licp->lic_unused = i + 1;
- XFS_LIC_INIT_SLOT(licp, i);
+ xfs_lic_init_slot(licp, i);
}
- lidp = XFS_LIC_SLOT(licp, i);
+ lidp = xfs_lic_slot(licp, i);
tp->t_items_free--;
lidp->lid_item = lip;
lidp->lid_flags = 0;
@@ -136,9 +136,9 @@ xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
xfs_log_item_chunk_t *licp;
xfs_log_item_chunk_t **licpp;
- slot = XFS_LIC_DESC_TO_SLOT(lidp);
- licp = XFS_LIC_DESC_TO_CHUNK(lidp);
- XFS_LIC_RELSE(licp, slot);
+ slot = xfs_lic_desc_to_slot(lidp);
+ licp = xfs_lic_desc_to_chunk(lidp);
+ xfs_lic_relse(licp, slot);
lidp->lid_item->li_desc = NULL;
tp->t_items_free++;
@@ -154,14 +154,14 @@ xfs_trans_free_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
* Also decrement the transaction structure's count of free items
* by the number in a chunk since we are freeing an empty chunk.
*/
- if (XFS_LIC_ARE_ALL_FREE(licp) && (licp != &(tp->t_items))) {
+ if (xfs_lic_are_all_free(licp) && (licp != &(tp->t_items))) {
licpp = &(tp->t_items.lic_next);
while (*licpp != licp) {
ASSERT(*licpp != NULL);
licpp = &((*licpp)->lic_next);
}
*licpp = licp->lic_next;
- kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+ kmem_free(licp);
tp->t_items_free -= XFS_LIC_NUM_SLOTS;
}
}
@@ -207,20 +207,20 @@ xfs_trans_first_item(xfs_trans_t *tp)
/*
* If it's not in the first chunk, skip to the second.
*/
- if (XFS_LIC_ARE_ALL_FREE(licp)) {
+ if (xfs_lic_are_all_free(licp)) {
licp = licp->lic_next;
}
/*
* Return the first non-free descriptor in the chunk.
*/
- ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+ ASSERT(!xfs_lic_are_all_free(licp));
for (i = 0; i < licp->lic_unused; i++) {
- if (XFS_LIC_ISFREE(licp, i)) {
+ if (xfs_lic_isfree(licp, i)) {
continue;
}
- return XFS_LIC_SLOT(licp, i);
+ return xfs_lic_slot(licp, i);
}
cmn_err(CE_WARN, "xfs_trans_first_item() -- no first item");
return NULL;
@@ -242,18 +242,18 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
xfs_log_item_chunk_t *licp;
int i;
- licp = XFS_LIC_DESC_TO_CHUNK(lidp);
+ licp = xfs_lic_desc_to_chunk(lidp);
/*
* First search the rest of the chunk. The for loop keeps us
* from referencing things beyond the end of the chunk.
*/
- for (i = (int)XFS_LIC_DESC_TO_SLOT(lidp) + 1; i < licp->lic_unused; i++) {
- if (XFS_LIC_ISFREE(licp, i)) {
+ for (i = (int)xfs_lic_desc_to_slot(lidp) + 1; i < licp->lic_unused; i++) {
+ if (xfs_lic_isfree(licp, i)) {
continue;
}
- return XFS_LIC_SLOT(licp, i);
+ return xfs_lic_slot(licp, i);
}
/*
@@ -266,13 +266,13 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
}
licp = licp->lic_next;
- ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+ ASSERT(!xfs_lic_are_all_free(licp));
for (i = 0; i < licp->lic_unused; i++) {
- if (XFS_LIC_ISFREE(licp, i)) {
+ if (xfs_lic_isfree(licp, i)) {
continue;
}
- return XFS_LIC_SLOT(licp, i);
+ return xfs_lic_slot(licp, i);
}
ASSERT(0);
/* NOTREACHED */
@@ -300,9 +300,9 @@ xfs_trans_free_items(
/*
* Special case the embedded chunk so we don't free it below.
*/
- if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+ if (!xfs_lic_are_all_free(licp)) {
(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
- XFS_LIC_ALL_FREE(licp);
+ xfs_lic_all_free(licp);
licp->lic_unused = 0;
}
licp = licp->lic_next;
@@ -311,10 +311,10 @@ xfs_trans_free_items(
* Unlock each item in each chunk and free the chunks.
*/
while (licp != NULL) {
- ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+ ASSERT(!xfs_lic_are_all_free(licp));
(void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
next_licp = licp->lic_next;
- kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+ kmem_free(licp);
licp = next_licp;
}
@@ -347,7 +347,7 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
/*
* Special case the embedded chunk so we don't free.
*/
- if (!XFS_LIC_ARE_ALL_FREE(licp)) {
+ if (!xfs_lic_are_all_free(licp)) {
freed = xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
}
licpp = &(tp->t_items.lic_next);
@@ -358,12 +358,12 @@ xfs_trans_unlock_items(xfs_trans_t *tp, xfs_lsn_t commit_lsn)
* and free empty chunks.
*/
while (licp != NULL) {
- ASSERT(!XFS_LIC_ARE_ALL_FREE(licp));
+ ASSERT(!xfs_lic_are_all_free(licp));
freed += xfs_trans_unlock_chunk(licp, 0, 0, commit_lsn);
next_licp = licp->lic_next;
- if (XFS_LIC_ARE_ALL_FREE(licp)) {
+ if (xfs_lic_are_all_free(licp)) {
*licpp = next_licp;
- kmem_free(licp, sizeof(xfs_log_item_chunk_t));
+ kmem_free(licp);
freed -= XFS_LIC_NUM_SLOTS;
} else {
licpp = &(licp->lic_next);
@@ -402,7 +402,7 @@ xfs_trans_unlock_chunk(
freed = 0;
lidp = licp->lic_descs;
for (i = 0; i < licp->lic_unused; i++, lidp++) {
- if (XFS_LIC_ISFREE(licp, i)) {
+ if (xfs_lic_isfree(licp, i)) {
continue;
}
lip = lidp->lid_item;
@@ -421,7 +421,7 @@ xfs_trans_unlock_chunk(
*/
if (!(freeing_chunk) &&
(!(lidp->lid_flags & XFS_LID_DIRTY) || abort)) {
- XFS_LIC_RELSE(licp, i);
+ xfs_lic_relse(licp, i);
freed++;
}
}
@@ -530,7 +530,7 @@ xfs_trans_free_busy(xfs_trans_t *tp)
lbcp = tp->t_busy.lbc_next;
while (lbcp != NULL) {
lbcq = lbcp->lbc_next;
- kmem_free(lbcp, sizeof(xfs_log_busy_chunk_t));
+ kmem_free(lbcp);
lbcp = lbcq;
}
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 98e5f110ba5..35d4d414bcc 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -237,7 +237,7 @@ xfs_droplink(
ASSERT (ip->i_d.di_nlink > 0);
ip->i_d.di_nlink--;
- drop_nlink(ip->i_vnode);
+ drop_nlink(VFS_I(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = 0;
@@ -301,7 +301,7 @@ xfs_bumplink(
ASSERT(ip->i_d.di_nlink > 0);
ip->i_d.di_nlink++;
- inc_nlink(ip->i_vnode);
+ inc_nlink(VFS_I(ip));
if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) &&
(ip->i_d.di_nlink > XFS_MAXLINK_1)) {
/*
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f316cb85d8e..ef321225d26 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -18,9 +18,6 @@
#ifndef __XFS_UTILS_H__
#define __XFS_UTILS_H__
-#define IRELE(ip) VN_RELE(XFS_ITOV(ip))
-#define IHOLD(ip) VN_HOLD(XFS_ITOV(ip))
-
extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
xfs_dev_t, cred_t *, prid_t, int,
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 30bacd8bb0e..439dd3939dd 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -58,586 +58,6 @@
#include "xfs_utils.h"
-int __init
-xfs_init(void)
-{
-#ifdef XFS_DABUF_DEBUG
- extern spinlock_t xfs_dabuf_global_lock;
- spin_lock_init(&xfs_dabuf_global_lock);
-#endif
-
- /*
- * Initialize all of the zone allocators we use.
- */
- xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
- "xfs_log_ticket");
- xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
- "xfs_bmap_free_item");
- xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
- "xfs_btree_cur");
- xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
- "xfs_da_state");
- xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
- xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
- xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
- xfs_acl_zone_init(xfs_acl_zone, "xfs_acl");
- xfs_mru_cache_init();
- xfs_filestream_init();
-
- /*
- * The size of the zone allocated buf log item is the maximum
- * size possible under XFS. This wastes a little bit of memory,
- * but it is much faster.
- */
- xfs_buf_item_zone =
- kmem_zone_init((sizeof(xfs_buf_log_item_t) +
- (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
- NBWORD) * sizeof(int))),
- "xfs_buf_item");
- xfs_efd_zone =
- kmem_zone_init((sizeof(xfs_efd_log_item_t) +
- ((XFS_EFD_MAX_FAST_EXTENTS - 1) *
- sizeof(xfs_extent_t))),
- "xfs_efd_item");
- xfs_efi_zone =
- kmem_zone_init((sizeof(xfs_efi_log_item_t) +
- ((XFS_EFI_MAX_FAST_EXTENTS - 1) *
- sizeof(xfs_extent_t))),
- "xfs_efi_item");
-
- /*
- * These zones warrant special memory allocator hints
- */
- xfs_inode_zone =
- kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
- KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
- KM_ZONE_SPREAD, NULL);
- xfs_ili_zone =
- kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
- KM_ZONE_SPREAD, NULL);
-
- /*
- * Allocate global trace buffers.
- */
-#ifdef XFS_ALLOC_TRACE
- xfs_alloc_trace_buf = ktrace_alloc(XFS_ALLOC_TRACE_SIZE, KM_SLEEP);
-#endif
-#ifdef XFS_BMAP_TRACE
- xfs_bmap_trace_buf = ktrace_alloc(XFS_BMAP_TRACE_SIZE, KM_SLEEP);
-#endif
-#ifdef XFS_BMBT_TRACE
- xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_SLEEP);
-#endif
-#ifdef XFS_ATTR_TRACE
- xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_SLEEP);
-#endif
-#ifdef XFS_DIR2_TRACE
- xfs_dir2_trace_buf = ktrace_alloc(XFS_DIR2_GTRACE_SIZE, KM_SLEEP);
-#endif
-
- xfs_dir_startup();
-
-#if (defined(DEBUG) || defined(INDUCE_IO_ERROR))
- xfs_error_test_init();
-#endif /* DEBUG || INDUCE_IO_ERROR */
-
- xfs_init_procfs();
- xfs_sysctl_register();
- return 0;
-}
-
-void __exit
-xfs_cleanup(void)
-{
- extern kmem_zone_t *xfs_inode_zone;
- extern kmem_zone_t *xfs_efd_zone;
- extern kmem_zone_t *xfs_efi_zone;
-
- xfs_cleanup_procfs();
- xfs_sysctl_unregister();
- xfs_filestream_uninit();
- xfs_mru_cache_uninit();
- xfs_acl_zone_destroy(xfs_acl_zone);
-
-#ifdef XFS_DIR2_TRACE
- ktrace_free(xfs_dir2_trace_buf);
-#endif
-#ifdef XFS_ATTR_TRACE
- ktrace_free(xfs_attr_trace_buf);
-#endif
-#ifdef XFS_BMBT_TRACE
- ktrace_free(xfs_bmbt_trace_buf);
-#endif
-#ifdef XFS_BMAP_TRACE
- ktrace_free(xfs_bmap_trace_buf);
-#endif
-#ifdef XFS_ALLOC_TRACE
- ktrace_free(xfs_alloc_trace_buf);
-#endif
-
- kmem_zone_destroy(xfs_bmap_free_item_zone);
- kmem_zone_destroy(xfs_btree_cur_zone);
- kmem_zone_destroy(xfs_inode_zone);
- kmem_zone_destroy(xfs_trans_zone);
- kmem_zone_destroy(xfs_da_state_zone);
- kmem_zone_destroy(xfs_dabuf_zone);
- kmem_zone_destroy(xfs_buf_item_zone);
- kmem_zone_destroy(xfs_efd_zone);
- kmem_zone_destroy(xfs_efi_zone);
- kmem_zone_destroy(xfs_ifork_zone);
- kmem_zone_destroy(xfs_ili_zone);
- kmem_zone_destroy(xfs_log_ticket_zone);
-}
-
-/*
- * xfs_start_flags
- *
- * This function fills in xfs_mount_t fields based on mount args.
- * Note: the superblock has _not_ yet been read in.
- */
-STATIC int
-xfs_start_flags(
- struct xfs_mount_args *ap,
- struct xfs_mount *mp)
-{
- /* Values are in BBs */
- if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
- /*
- * At this point the superblock has not been read
- * in, therefore we do not know the block size.
- * Before the mount call ends we will convert
- * these to FSBs.
- */
- mp->m_dalign = ap->sunit;
- mp->m_swidth = ap->swidth;
- }
-
- if (ap->logbufs != -1 &&
- ap->logbufs != 0 &&
- (ap->logbufs < XLOG_MIN_ICLOGS ||
- ap->logbufs > XLOG_MAX_ICLOGS)) {
- cmn_err(CE_WARN,
- "XFS: invalid logbufs value: %d [not %d-%d]",
- ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
- return XFS_ERROR(EINVAL);
- }
- mp->m_logbufs = ap->logbufs;
- if (ap->logbufsize != -1 &&
- ap->logbufsize != 0 &&
- (ap->logbufsize < XLOG_MIN_RECORD_BSIZE ||
- ap->logbufsize > XLOG_MAX_RECORD_BSIZE ||
- !is_power_of_2(ap->logbufsize))) {
- cmn_err(CE_WARN,
- "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
- ap->logbufsize);
- return XFS_ERROR(EINVAL);
- }
- mp->m_logbsize = ap->logbufsize;
- mp->m_fsname_len = strlen(ap->fsname) + 1;
- mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP);
- strcpy(mp->m_fsname, ap->fsname);
- if (ap->rtname[0]) {
- mp->m_rtname = kmem_alloc(strlen(ap->rtname) + 1, KM_SLEEP);
- strcpy(mp->m_rtname, ap->rtname);
- }
- if (ap->logname[0]) {
- mp->m_logname = kmem_alloc(strlen(ap->logname) + 1, KM_SLEEP);
- strcpy(mp->m_logname, ap->logname);
- }
-
- if (ap->flags & XFSMNT_WSYNC)
- mp->m_flags |= XFS_MOUNT_WSYNC;
-#if XFS_BIG_INUMS
- if (ap->flags & XFSMNT_INO64) {
- mp->m_flags |= XFS_MOUNT_INO64;
- mp->m_inoadd = XFS_INO64_OFFSET;
- }
-#endif
- if (ap->flags & XFSMNT_RETERR)
- mp->m_flags |= XFS_MOUNT_RETERR;
- if (ap->flags & XFSMNT_NOALIGN)
- mp->m_flags |= XFS_MOUNT_NOALIGN;
- if (ap->flags & XFSMNT_SWALLOC)
- mp->m_flags |= XFS_MOUNT_SWALLOC;
- if (ap->flags & XFSMNT_OSYNCISOSYNC)
- mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
- if (ap->flags & XFSMNT_32BITINODES)
- mp->m_flags |= XFS_MOUNT_32BITINODES;
-
- if (ap->flags & XFSMNT_IOSIZE) {
- if (ap->iosizelog > XFS_MAX_IO_LOG ||
- ap->iosizelog < XFS_MIN_IO_LOG) {
- cmn_err(CE_WARN,
- "XFS: invalid log iosize: %d [not %d-%d]",
- ap->iosizelog, XFS_MIN_IO_LOG,
- XFS_MAX_IO_LOG);
- return XFS_ERROR(EINVAL);
- }
-
- mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
- mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
- }
-
- if (ap->flags & XFSMNT_IKEEP)
- mp->m_flags |= XFS_MOUNT_IKEEP;
- if (ap->flags & XFSMNT_DIRSYNC)
- mp->m_flags |= XFS_MOUNT_DIRSYNC;
- if (ap->flags & XFSMNT_ATTR2)
- mp->m_flags |= XFS_MOUNT_ATTR2;
-
- if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE)
- mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
-
- /*
- * no recovery flag requires a read-only mount
- */
- if (ap->flags & XFSMNT_NORECOVERY) {
- if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
- cmn_err(CE_WARN,
- "XFS: tried to mount a FS read-write without recovery!");
- return XFS_ERROR(EINVAL);
- }
- mp->m_flags |= XFS_MOUNT_NORECOVERY;
- }
-
- if (ap->flags & XFSMNT_NOUUID)
- mp->m_flags |= XFS_MOUNT_NOUUID;
- if (ap->flags & XFSMNT_BARRIER)
- mp->m_flags |= XFS_MOUNT_BARRIER;
- else
- mp->m_flags &= ~XFS_MOUNT_BARRIER;
-
- if (ap->flags2 & XFSMNT2_FILESTREAMS)
- mp->m_flags |= XFS_MOUNT_FILESTREAMS;
-
- if (ap->flags & XFSMNT_DMAPI)
- mp->m_flags |= XFS_MOUNT_DMAPI;
- return 0;
-}
-
-/*
- * This function fills in xfs_mount_t fields based on mount args.
- * Note: the superblock _has_ now been read in.
- */
-STATIC int
-xfs_finish_flags(
- struct xfs_mount_args *ap,
- struct xfs_mount *mp)
-{
- int ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
-
- /* Fail a mount where the logbuf is smaller then the log stripe */
- if (xfs_sb_version_haslogv2(&mp->m_sb)) {
- if ((ap->logbufsize <= 0) &&
- (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) {
- mp->m_logbsize = mp->m_sb.sb_logsunit;
- } else if (ap->logbufsize > 0 &&
- ap->logbufsize < mp->m_sb.sb_logsunit) {
- cmn_err(CE_WARN,
- "XFS: logbuf size must be greater than or equal to log stripe size");
- return XFS_ERROR(EINVAL);
- }
- } else {
- /* Fail a mount if the logbuf is larger than 32K */
- if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) {
- cmn_err(CE_WARN,
- "XFS: logbuf size for version 1 logs must be 16K or 32K");
- return XFS_ERROR(EINVAL);
- }
- }
-
- if (xfs_sb_version_hasattr2(&mp->m_sb))
- mp->m_flags |= XFS_MOUNT_ATTR2;
-
- /*
- * prohibit r/w mounts of read-only filesystems
- */
- if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
- cmn_err(CE_WARN,
- "XFS: cannot mount a read-only filesystem as read-write");
- return XFS_ERROR(EROFS);
- }
-
- /*
- * check for shared mount.
- */
- if (ap->flags & XFSMNT_SHARED) {
- if (!xfs_sb_version_hasshared(&mp->m_sb))
- return XFS_ERROR(EINVAL);
-
- /*
- * For IRIX 6.5, shared mounts must have the shared
- * version bit set, have the persistent readonly
- * field set, must be version 0 and can only be mounted
- * read-only.
- */
- if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) ||
- (mp->m_sb.sb_shared_vn != 0))
- return XFS_ERROR(EINVAL);
-
- mp->m_flags |= XFS_MOUNT_SHARED;
-
- /*
- * Shared XFS V0 can't deal with DMI. Return EINVAL.
- */
- if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI))
- return XFS_ERROR(EINVAL);
- }
-
- if (ap->flags & XFSMNT_UQUOTA) {
- mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
- if (ap->flags & XFSMNT_UQUOTAENF)
- mp->m_qflags |= XFS_UQUOTA_ENFD;
- }
-
- if (ap->flags & XFSMNT_GQUOTA) {
- mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
- if (ap->flags & XFSMNT_GQUOTAENF)
- mp->m_qflags |= XFS_OQUOTA_ENFD;
- } else if (ap->flags & XFSMNT_PQUOTA) {
- mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
- if (ap->flags & XFSMNT_PQUOTAENF)
- mp->m_qflags |= XFS_OQUOTA_ENFD;
- }
-
- return 0;
-}
-
-/*
- * xfs_mount
- *
- * The file system configurations are:
- * (1) device (partition) with data and internal log
- * (2) logical volume with data and log subvolumes.
- * (3) logical volume with data, log, and realtime subvolumes.
- *
- * We only have to handle opening the log and realtime volumes here if
- * they are present. The data subvolume has already been opened by
- * get_sb_bdev() and is stored in vfsp->vfs_super->s_bdev.
- */
-int
-xfs_mount(
- struct xfs_mount *mp,
- struct xfs_mount_args *args,
- cred_t *credp)
-{
- struct block_device *ddev, *logdev, *rtdev;
- int flags = 0, error;
-
- ddev = mp->m_super->s_bdev;
- logdev = rtdev = NULL;
-
- error = xfs_dmops_get(mp, args);
- if (error)
- return error;
- error = xfs_qmops_get(mp, args);
- if (error)
- return error;
-
- if (args->flags & XFSMNT_QUIET)
- flags |= XFS_MFSI_QUIET;
-
- /*
- * Open real time and log devices - order is important.
- */
- if (args->logname[0]) {
- error = xfs_blkdev_get(mp, args->logname, &logdev);
- if (error)
- return error;
- }
- if (args->rtname[0]) {
- error = xfs_blkdev_get(mp, args->rtname, &rtdev);
- if (error) {
- xfs_blkdev_put(logdev);
- return error;
- }
-
- if (rtdev == ddev || rtdev == logdev) {
- cmn_err(CE_WARN,
- "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev.");
- xfs_blkdev_put(logdev);
- xfs_blkdev_put(rtdev);
- return EINVAL;
- }
- }
-
- /*
- * Setup xfs_mount buffer target pointers
- */
- error = ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0);
- if (!mp->m_ddev_targp) {
- xfs_blkdev_put(logdev);
- xfs_blkdev_put(rtdev);
- return error;
- }
- if (rtdev) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1);
- if (!mp->m_rtdev_targp) {
- xfs_blkdev_put(logdev);
- xfs_blkdev_put(rtdev);
- goto error0;
- }
- }
- mp->m_logdev_targp = (logdev && logdev != ddev) ?
- xfs_alloc_buftarg(logdev, 1) : mp->m_ddev_targp;
- if (!mp->m_logdev_targp) {
- xfs_blkdev_put(logdev);
- xfs_blkdev_put(rtdev);
- goto error0;
- }
-
- /*
- * Setup flags based on mount(2) options and then the superblock
- */
- error = xfs_start_flags(args, mp);
- if (error)
- goto error1;
- error = xfs_readsb(mp, flags);
- if (error)
- goto error1;
- error = xfs_finish_flags(args, mp);
- if (error)
- goto error2;
-
- /*
- * Setup xfs_mount buffer target pointers based on superblock
- */
- error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize,
- mp->m_sb.sb_sectsize);
- if (!error && logdev && logdev != ddev) {
- unsigned int log_sector_size = BBSIZE;
-
- if (xfs_sb_version_hassector(&mp->m_sb))
- log_sector_size = mp->m_sb.sb_logsectsize;
- error = xfs_setsize_buftarg(mp->m_logdev_targp,
- mp->m_sb.sb_blocksize,
- log_sector_size);
- }
- if (!error && rtdev)
- error = xfs_setsize_buftarg(mp->m_rtdev_targp,
- mp->m_sb.sb_blocksize,
- mp->m_sb.sb_sectsize);
- if (error)
- goto error2;
-
- if (mp->m_flags & XFS_MOUNT_BARRIER)
- xfs_mountfs_check_barriers(mp);
-
- if ((error = xfs_filestream_mount(mp)))
- goto error2;
-
- error = xfs_mountfs(mp, flags);
- if (error)
- goto error2;
-
- XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname);
-
- return 0;
-
-error2:
- if (mp->m_sb_bp)
- xfs_freesb(mp);
-error1:
- xfs_binval(mp->m_ddev_targp);
- if (logdev && logdev != ddev)
- xfs_binval(mp->m_logdev_targp);
- if (rtdev)
- xfs_binval(mp->m_rtdev_targp);
-error0:
- xfs_unmountfs_close(mp, credp);
- xfs_qmops_put(mp);
- xfs_dmops_put(mp);
- return error;
-}
-
-int
-xfs_unmount(
- xfs_mount_t *mp,
- int flags,
- cred_t *credp)
-{
- xfs_inode_t *rip;
- bhv_vnode_t *rvp;
- int unmount_event_wanted = 0;
- int unmount_event_flags = 0;
- int xfs_unmountfs_needed = 0;
- int error;
-
- rip = mp->m_rootip;
- rvp = XFS_ITOV(rip);
-
-#ifdef HAVE_DMAPI
- if (mp->m_flags & XFS_MOUNT_DMAPI) {
- error = XFS_SEND_PREUNMOUNT(mp,
- rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL,
- NULL, NULL, 0, 0,
- (mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))?
- 0:DM_FLAGS_UNWANTED);
- if (error)
- return XFS_ERROR(error);
- unmount_event_wanted = 1;
- unmount_event_flags = (mp->m_dmevmask & (1<<DM_EVENT_UNMOUNT))?
- 0 : DM_FLAGS_UNWANTED;
- }
-#endif
-
- /*
- * Blow away any referenced inode in the filestreams cache.
- * This can and will cause log traffic as inodes go inactive
- * here.
- */
- xfs_filestream_unmount(mp);
-
- XFS_bflush(mp->m_ddev_targp);
- error = xfs_unmount_flush(mp, 0);
- if (error)
- goto out;
-
- ASSERT(vn_count(rvp) == 1);
-
- /*
- * Drop the reference count
- */
- IRELE(rip);
-
- /*
- * If we're forcing a shutdown, typically because of a media error,
- * we want to make sure we invalidate dirty pages that belong to
- * referenced vnodes as well.
- */
- if (XFS_FORCED_SHUTDOWN(mp)) {
- error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE);
- ASSERT(error != EFSCORRUPTED);
- }
- xfs_unmountfs_needed = 1;
-
-out:
- /* Send DMAPI event, if required.
- * Then do xfs_unmountfs() if needed.
- * Then return error (or zero).
- */
- if (unmount_event_wanted) {
- /* Note: mp structure must still exist for
- * XFS_SEND_UNMOUNT() call.
- */
- XFS_SEND_UNMOUNT(mp, error == 0 ? rip : NULL,
- DM_RIGHT_NULL, 0, error, unmount_event_flags);
- }
- if (xfs_unmountfs_needed) {
- /*
- * Call common unmount function to flush to disk
- * and free the super block buffer & mount structures.
- */
- xfs_unmountfs(mp, credp);
- xfs_qmops_put(mp);
- xfs_dmops_put(mp);
- kmem_free(mp, sizeof(xfs_mount_t));
- }
-
- return XFS_ERROR(error);
-}
-
STATIC void
xfs_quiesce_fs(
xfs_mount_t *mp)
@@ -694,30 +114,6 @@ xfs_attr_quiesce(
xfs_unmountfs_writesb(mp);
}
-int
-xfs_mntupdate(
- struct xfs_mount *mp,
- int *flags,
- struct xfs_mount_args *args)
-{
- if (!(*flags & MS_RDONLY)) { /* rw/ro -> rw */
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- mp->m_flags &= ~XFS_MOUNT_RDONLY;
- if (args->flags & XFSMNT_BARRIER) {
- mp->m_flags |= XFS_MOUNT_BARRIER;
- xfs_mountfs_check_barriers(mp);
- } else {
- mp->m_flags &= ~XFS_MOUNT_BARRIER;
- }
- } else if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { /* rw -> ro */
- xfs_filestream_flush(mp);
- xfs_sync(mp, SYNC_DATA_QUIESCE);
- xfs_attr_quiesce(mp);
- mp->m_flags |= XFS_MOUNT_RDONLY;
- }
- return 0;
-}
-
/*
* xfs_unmount_flush implements a set of flush operation on special
* inodes, which are needed as a separate set of operations so that
@@ -732,7 +128,6 @@ xfs_unmount_flush(
xfs_inode_t *rip = mp->m_rootip;
xfs_inode_t *rbmip;
xfs_inode_t *rsumip = NULL;
- bhv_vnode_t *rvp = XFS_ITOV(rip);
int error;
xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
@@ -750,7 +145,7 @@ xfs_unmount_flush(
if (error == EFSCORRUPTED)
goto fscorrupt_out;
- ASSERT(vn_count(XFS_ITOV(rbmip)) == 1);
+ ASSERT(vn_count(VFS_I(rbmip)) == 1);
rsumip = mp->m_rsumip;
xfs_ilock(rsumip, XFS_ILOCK_EXCL);
@@ -761,7 +156,7 @@ xfs_unmount_flush(
if (error == EFSCORRUPTED)
goto fscorrupt_out;
- ASSERT(vn_count(XFS_ITOV(rsumip)) == 1);
+ ASSERT(vn_count(VFS_I(rsumip)) == 1);
}
/*
@@ -771,7 +166,7 @@ xfs_unmount_flush(
if (error == EFSCORRUPTED)
goto fscorrupt_out2;
- if (vn_count(rvp) != 1 && !relocation) {
+ if (vn_count(VFS_I(rip)) != 1 && !relocation) {
xfs_iunlock(rip, XFS_ILOCK_EXCL);
return XFS_ERROR(EBUSY);
}
@@ -888,7 +283,7 @@ xfs_sync_inodes(
int *bypassed)
{
xfs_inode_t *ip = NULL;
- bhv_vnode_t *vp = NULL;
+ struct inode *vp = NULL;
int error;
int last_error;
uint64_t fflag;
@@ -1008,7 +403,7 @@ xfs_sync_inodes(
continue;
}
- vp = XFS_ITOV_NULL(ip);
+ vp = VFS_I(ip);
/*
* If the vnode is gone then this is being torn down,
@@ -1048,7 +443,7 @@ xfs_sync_inodes(
if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
XFS_MOUNT_IUNLOCK(mp);
- kmem_free(ipointer, sizeof(xfs_iptr_t));
+ kmem_free(ipointer);
return 0;
}
@@ -1083,7 +478,7 @@ xfs_sync_inodes(
IPOINTER_INSERT(ip, mp);
xfs_ilock(ip, lock_flags);
- ASSERT(vp == XFS_ITOV(ip));
+ ASSERT(vp == VFS_I(ip));
ASSERT(ip->i_mount == mp);
vnode_refed = B_TRUE;
@@ -1194,7 +589,7 @@ xfs_sync_inodes(
}
XFS_MOUNT_IUNLOCK(mp);
ASSERT(ipointer_in == B_FALSE);
- kmem_free(ipointer, sizeof(xfs_iptr_t));
+ kmem_free(ipointer);
return XFS_ERROR(error);
}
@@ -1224,7 +619,7 @@ xfs_sync_inodes(
ASSERT(ipointer_in == B_FALSE);
- kmem_free(ipointer, sizeof(xfs_iptr_t));
+ kmem_free(ipointer);
return XFS_ERROR(last_error);
}
diff --git a/fs/xfs/xfs_vfsops.h b/fs/xfs/xfs_vfsops.h
index 1688817c55e..a74b05087da 100644
--- a/fs/xfs/xfs_vfsops.h
+++ b/fs/xfs/xfs_vfsops.h
@@ -8,11 +8,6 @@ struct kstatfs;
struct xfs_mount;
struct xfs_mount_args;
-int xfs_mount(struct xfs_mount *mp, struct xfs_mount_args *args,
- struct cred *credp);
-int xfs_unmount(struct xfs_mount *mp, int flags, struct cred *credp);
-int xfs_mntupdate(struct xfs_mount *mp, int *flags,
- struct xfs_mount_args *args);
int xfs_sync(struct xfs_mount *mp, int flags);
void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
int lnnum);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index e475e3717eb..8b6812f66a1 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -75,26 +75,23 @@ xfs_open(
return 0;
}
-/*
- * xfs_setattr
- */
int
xfs_setattr(
- xfs_inode_t *ip,
- bhv_vattr_t *vap,
+ struct xfs_inode *ip,
+ struct iattr *iattr,
int flags,
cred_t *credp)
{
xfs_mount_t *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ int mask = iattr->ia_valid;
xfs_trans_t *tp;
- int mask;
int code;
uint lock_flags;
uint commit_flags=0;
uid_t uid=0, iuid=0;
gid_t gid=0, igid=0;
int timeflags = 0;
- xfs_prid_t projid=0, iprojid=0;
struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
int file_owner;
int need_iolock = 1;
@@ -104,30 +101,9 @@ xfs_setattr(
if (mp->m_flags & XFS_MOUNT_RDONLY)
return XFS_ERROR(EROFS);
- /*
- * Cannot set certain attributes.
- */
- mask = vap->va_mask;
- if (mask & XFS_AT_NOSET) {
- return XFS_ERROR(EINVAL);
- }
-
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
- /*
- * Timestamps do not need to be logged and hence do not
- * need to be done within a transaction.
- */
- if (mask & XFS_AT_UPDTIMES) {
- ASSERT((mask & ~XFS_AT_UPDTIMES) == 0);
- timeflags = ((mask & XFS_AT_UPDATIME) ? XFS_ICHGTIME_ACC : 0) |
- ((mask & XFS_AT_UPDCTIME) ? XFS_ICHGTIME_CHG : 0) |
- ((mask & XFS_AT_UPDMTIME) ? XFS_ICHGTIME_MOD : 0);
- xfs_ichgtime(ip, timeflags);
- return 0;
- }
-
olddquot1 = olddquot2 = NULL;
udqp = gdqp = NULL;
@@ -139,28 +115,22 @@ xfs_setattr(
* If the IDs do change before we take the ilock, we're covered
* because the i_*dquot fields will get updated anyway.
*/
- if (XFS_IS_QUOTA_ON(mp) &&
- (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID))) {
+ if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) {
uint qflags = 0;
- if ((mask & XFS_AT_UID) && XFS_IS_UQUOTA_ON(mp)) {
- uid = vap->va_uid;
+ if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
+ uid = iattr->ia_uid;
qflags |= XFS_QMOPT_UQUOTA;
} else {
uid = ip->i_d.di_uid;
}
- if ((mask & XFS_AT_GID) && XFS_IS_GQUOTA_ON(mp)) {
- gid = vap->va_gid;
+ if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
+ gid = iattr->ia_gid;
qflags |= XFS_QMOPT_GQUOTA;
} else {
gid = ip->i_d.di_gid;
}
- if ((mask & XFS_AT_PROJID) && XFS_IS_PQUOTA_ON(mp)) {
- projid = vap->va_projid;
- qflags |= XFS_QMOPT_PQUOTA;
- } else {
- projid = ip->i_d.di_projid;
- }
+
/*
* We take a reference when we initialize udqp and gdqp,
* so it is important that we never blindly double trip on
@@ -168,8 +138,8 @@ xfs_setattr(
*/
ASSERT(udqp == NULL);
ASSERT(gdqp == NULL);
- code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, projid, qflags,
- &udqp, &gdqp);
+ code = XFS_QM_DQVOPALLOC(mp, ip, uid, gid, ip->i_d.di_projid,
+ qflags, &udqp, &gdqp);
if (code)
return code;
}
@@ -180,10 +150,10 @@ xfs_setattr(
*/
tp = NULL;
lock_flags = XFS_ILOCK_EXCL;
- if (flags & ATTR_NOLOCK)
+ if (flags & XFS_ATTR_NOLOCK)
need_iolock = 0;
- if (!(mask & XFS_AT_SIZE)) {
- if ((mask != (XFS_AT_CTIME|XFS_AT_ATIME|XFS_AT_MTIME)) ||
+ if (!(mask & ATTR_SIZE)) {
+ if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) ||
(mp->m_flags & XFS_MOUNT_WSYNC)) {
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
commit_flags = 0;
@@ -196,10 +166,10 @@ xfs_setattr(
}
} else {
if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
- !(flags & ATTR_DMI)) {
+ !(flags & XFS_ATTR_DMI)) {
int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
- vap->va_size, 0, dmflags, NULL);
+ iattr->ia_size, 0, dmflags, NULL);
if (code) {
lock_flags = 0;
goto error_return;
@@ -212,16 +182,14 @@ xfs_setattr(
xfs_ilock(ip, lock_flags);
/* boolean: are we the file owner? */
- file_owner = (current_fsuid(credp) == ip->i_d.di_uid);
+ file_owner = (current_fsuid() == ip->i_d.di_uid);
/*
* Change various properties of a file.
* Only the owner or users with CAP_FOWNER
* capability may do these things.
*/
- if (mask &
- (XFS_AT_MODE|XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_UID|
- XFS_AT_GID|XFS_AT_PROJID)) {
+ if (mask & (ATTR_MODE|ATTR_UID|ATTR_GID)) {
/*
* CAP_FOWNER overrides the following restrictions:
*
@@ -245,21 +213,21 @@ xfs_setattr(
* IDs of the calling process shall match the group owner of
* the file when setting the set-group-ID bit on that file
*/
- if (mask & XFS_AT_MODE) {
+ if (mask & ATTR_MODE) {
mode_t m = 0;
- if ((vap->va_mode & S_ISUID) && !file_owner)
+ if ((iattr->ia_mode & S_ISUID) && !file_owner)
m |= S_ISUID;
- if ((vap->va_mode & S_ISGID) &&
+ if ((iattr->ia_mode & S_ISGID) &&
!in_group_p((gid_t)ip->i_d.di_gid))
m |= S_ISGID;
#if 0
/* Linux allows this, Irix doesn't. */
- if ((vap->va_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode))
+ if ((iattr->ia_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode))
m |= S_ISVTX;
#endif
if (m && !capable(CAP_FSETID))
- vap->va_mode &= ~m;
+ iattr->ia_mode &= ~m;
}
}
@@ -270,7 +238,7 @@ xfs_setattr(
* and can change the group id only to a group of which he
* or she is a member.
*/
- if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
+ if (mask & (ATTR_UID|ATTR_GID)) {
/*
* These IDs could have changed since we last looked at them.
* But, we're assured that if the ownership did change
@@ -278,12 +246,9 @@ xfs_setattr(
* would have changed also.
*/
iuid = ip->i_d.di_uid;
- iprojid = ip->i_d.di_projid;
igid = ip->i_d.di_gid;
- gid = (mask & XFS_AT_GID) ? vap->va_gid : igid;
- uid = (mask & XFS_AT_UID) ? vap->va_uid : iuid;
- projid = (mask & XFS_AT_PROJID) ? (xfs_prid_t)vap->va_projid :
- iprojid;
+ gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
+ uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
/*
* CAP_CHOWN overrides the following restrictions:
@@ -303,11 +268,10 @@ xfs_setattr(
goto error_return;
}
/*
- * Do a quota reservation only if uid/projid/gid is actually
+ * Do a quota reservation only if uid/gid is actually
* going to change.
*/
if ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
- (XFS_IS_PQUOTA_ON(mp) && iprojid != projid) ||
(XFS_IS_GQUOTA_ON(mp) && igid != gid)) {
ASSERT(tp);
code = XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, udqp, gdqp,
@@ -321,13 +285,13 @@ xfs_setattr(
/*
* Truncate file. Must have write permission and not be a directory.
*/
- if (mask & XFS_AT_SIZE) {
+ if (mask & ATTR_SIZE) {
/* Short circuit the truncate case for zero length files */
- if ((vap->va_size == 0) &&
- (ip->i_size == 0) && (ip->i_d.di_nextents == 0)) {
+ if (iattr->ia_size == 0 &&
+ ip->i_size == 0 && ip->i_d.di_nextents == 0) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
lock_flags &= ~XFS_ILOCK_EXCL;
- if (mask & XFS_AT_CTIME)
+ if (mask & ATTR_CTIME)
xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
code = 0;
goto error_return;
@@ -350,9 +314,9 @@ xfs_setattr(
/*
* Change file access or modified times.
*/
- if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
+ if (mask & (ATTR_ATIME|ATTR_MTIME)) {
if (!file_owner) {
- if ((flags & ATTR_UTIME) &&
+ if ((mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)) &&
!capable(CAP_FOWNER)) {
code = XFS_ERROR(EPERM);
goto error_return;
@@ -361,90 +325,23 @@ xfs_setattr(
}
/*
- * Change extent size or realtime flag.
- */
- if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
- /*
- * Can't change extent size if any extents are allocated.
- */
- if (ip->i_d.di_nextents && (mask & XFS_AT_EXTSIZE) &&
- ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
- vap->va_extsize) ) {
- code = XFS_ERROR(EINVAL); /* EFBIG? */
- goto error_return;
- }
-
- /*
- * Can't change realtime flag if any extents are allocated.
- */
- if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
- (mask & XFS_AT_XFLAGS) &&
- (XFS_IS_REALTIME_INODE(ip)) !=
- (vap->va_xflags & XFS_XFLAG_REALTIME)) {
- code = XFS_ERROR(EINVAL); /* EFBIG? */
- goto error_return;
- }
- /*
- * Extent size must be a multiple of the appropriate block
- * size, if set at all.
- */
- if ((mask & XFS_AT_EXTSIZE) && vap->va_extsize != 0) {
- xfs_extlen_t size;
-
- if (XFS_IS_REALTIME_INODE(ip) ||
- ((mask & XFS_AT_XFLAGS) &&
- (vap->va_xflags & XFS_XFLAG_REALTIME))) {
- size = mp->m_sb.sb_rextsize <<
- mp->m_sb.sb_blocklog;
- } else {
- size = mp->m_sb.sb_blocksize;
- }
- if (vap->va_extsize % size) {
- code = XFS_ERROR(EINVAL);
- goto error_return;
- }
- }
- /*
- * If realtime flag is set then must have realtime data.
- */
- if ((mask & XFS_AT_XFLAGS) &&
- (vap->va_xflags & XFS_XFLAG_REALTIME)) {
- if ((mp->m_sb.sb_rblocks == 0) ||
- (mp->m_sb.sb_rextsize == 0) ||
- (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
- code = XFS_ERROR(EINVAL);
- goto error_return;
- }
- }
-
- /*
- * Can't modify an immutable/append-only file unless
- * we have appropriate permission.
- */
- if ((mask & XFS_AT_XFLAGS) &&
- (ip->i_d.di_flags &
- (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
- (vap->va_xflags &
- (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
- !capable(CAP_LINUX_IMMUTABLE)) {
- code = XFS_ERROR(EPERM);
- goto error_return;
- }
- }
-
- /*
* Now we can make the changes. Before we join the inode
- * to the transaction, if XFS_AT_SIZE is set then take care of
+ * to the transaction, if ATTR_SIZE is set then take care of
* the part of the truncation that must be done without the
* inode lock. This needs to be done before joining the inode
* to the transaction, because the inode cannot be unlocked
* once it is a part of the transaction.
*/
- if (mask & XFS_AT_SIZE) {
+ if (mask & ATTR_SIZE) {
code = 0;
- if ((vap->va_size > ip->i_size) &&
- (flags & ATTR_NOSIZETOK) == 0) {
- code = xfs_igrow_start(ip, vap->va_size, credp);
+ if (iattr->ia_size > ip->i_size) {
+ /*
+ * Do the first part of growing a file: zero any data
+ * in the last block that is beyond the old EOF. We
+ * need to do this before the inode is joined to the
+ * transaction to modify the i_size.
+ */
+ code = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
}
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -461,10 +358,10 @@ xfs_setattr(
* not within the range we care about here.
*/
if (!code &&
- (ip->i_size != ip->i_d.di_size) &&
- (vap->va_size > ip->i_d.di_size)) {
+ ip->i_size != ip->i_d.di_size &&
+ iattr->ia_size > ip->i_d.di_size) {
code = xfs_flush_pages(ip,
- ip->i_d.di_size, vap->va_size,
+ ip->i_d.di_size, iattr->ia_size,
XFS_B_ASYNC, FI_NONE);
}
@@ -472,7 +369,7 @@ xfs_setattr(
vn_iowait(ip);
if (!code)
- code = xfs_itruncate_data(ip, vap->va_size);
+ code = xfs_itruncate_data(ip, iattr->ia_size);
if (code) {
ASSERT(tp == NULL);
lock_flags &= ~XFS_ILOCK_EXCL;
@@ -501,28 +398,30 @@ xfs_setattr(
/*
* Truncate file. Must have write permission and not be a directory.
*/
- if (mask & XFS_AT_SIZE) {
+ if (mask & ATTR_SIZE) {
/*
* Only change the c/mtime if we are changing the size
* or we are explicitly asked to change it. This handles
* the semantic difference between truncate() and ftruncate()
* as implemented in the VFS.
*/
- if (vap->va_size != ip->i_size || (mask & XFS_AT_CTIME))
+ if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME))
timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
- if (vap->va_size > ip->i_size) {
- xfs_igrow_finish(tp, ip, vap->va_size,
- !(flags & ATTR_DMI));
- } else if ((vap->va_size <= ip->i_size) ||
- ((vap->va_size == 0) && ip->i_d.di_nextents)) {
+ if (iattr->ia_size > ip->i_size) {
+ ip->i_d.di_size = iattr->ia_size;
+ ip->i_size = iattr->ia_size;
+ if (!(flags & XFS_ATTR_DMI))
+ xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ } else if (iattr->ia_size <= ip->i_size ||
+ (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
/*
* signal a sync transaction unless
* we're truncating an already unlinked
* file on a wsync filesystem
*/
- code = xfs_itruncate_finish(&tp, ip,
- (xfs_fsize_t)vap->va_size,
+ code = xfs_itruncate_finish(&tp, ip, iattr->ia_size,
XFS_DATA_FORK,
((ip->i_d.di_nlink != 0 ||
!(mp->m_flags & XFS_MOUNT_WSYNC))
@@ -544,9 +443,12 @@ xfs_setattr(
/*
* Change file access modes.
*/
- if (mask & XFS_AT_MODE) {
+ if (mask & ATTR_MODE) {
ip->i_d.di_mode &= S_IFMT;
- ip->i_d.di_mode |= vap->va_mode & ~S_IFMT;
+ ip->i_d.di_mode |= iattr->ia_mode & ~S_IFMT;
+
+ inode->i_mode &= S_IFMT;
+ inode->i_mode |= iattr->ia_mode & ~S_IFMT;
xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
timeflags |= XFS_ICHGTIME_CHG;
@@ -559,7 +461,7 @@ xfs_setattr(
* and can change the group id only to a group of which he
* or she is a member.
*/
- if (mask & (XFS_AT_UID|XFS_AT_GID|XFS_AT_PROJID)) {
+ if (mask & (ATTR_UID|ATTR_GID)) {
/*
* CAP_FSETID overrides the following restrictions:
*
@@ -577,39 +479,24 @@ xfs_setattr(
*/
if (iuid != uid) {
if (XFS_IS_UQUOTA_ON(mp)) {
- ASSERT(mask & XFS_AT_UID);
+ ASSERT(mask & ATTR_UID);
ASSERT(udqp);
olddquot1 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
&ip->i_udquot, udqp);
}
ip->i_d.di_uid = uid;
+ inode->i_uid = uid;
}
if (igid != gid) {
if (XFS_IS_GQUOTA_ON(mp)) {
ASSERT(!XFS_IS_PQUOTA_ON(mp));
- ASSERT(mask & XFS_AT_GID);
+ ASSERT(mask & ATTR_GID);
ASSERT(gdqp);
olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
&ip->i_gdquot, gdqp);
}
ip->i_d.di_gid = gid;
- }
- if (iprojid != projid) {
- if (XFS_IS_PQUOTA_ON(mp)) {
- ASSERT(!XFS_IS_GQUOTA_ON(mp));
- ASSERT(mask & XFS_AT_PROJID);
- ASSERT(gdqp);
- olddquot2 = XFS_QM_DQVOPCHOWN(mp, tp, ip,
- &ip->i_gdquot, gdqp);
- }
- ip->i_d.di_projid = projid;
- /*
- * We may have to rev the inode as well as
- * the superblock version number since projids didn't
- * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
- */
- if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
- xfs_bump_ino_vers2(tp, ip);
+ inode->i_gid = gid;
}
xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
@@ -620,82 +507,33 @@ xfs_setattr(
/*
* Change file access or modified times.
*/
- if (mask & (XFS_AT_ATIME|XFS_AT_MTIME)) {
- if (mask & XFS_AT_ATIME) {
- ip->i_d.di_atime.t_sec = vap->va_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = vap->va_atime.tv_nsec;
+ if (mask & (ATTR_ATIME|ATTR_MTIME)) {
+ if (mask & ATTR_ATIME) {
+ inode->i_atime = iattr->ia_atime;
+ ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
+ ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
ip->i_update_core = 1;
- timeflags &= ~XFS_ICHGTIME_ACC;
}
- if (mask & XFS_AT_MTIME) {
- ip->i_d.di_mtime.t_sec = vap->va_mtime.tv_sec;
- ip->i_d.di_mtime.t_nsec = vap->va_mtime.tv_nsec;
+ if (mask & ATTR_MTIME) {
+ inode->i_mtime = iattr->ia_mtime;
+ ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
+ ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
timeflags &= ~XFS_ICHGTIME_MOD;
timeflags |= XFS_ICHGTIME_CHG;
}
- if (tp && (flags & ATTR_UTIME))
+ if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)))
xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
}
/*
- * Change XFS-added attributes.
- */
- if (mask & (XFS_AT_EXTSIZE|XFS_AT_XFLAGS)) {
- if (mask & XFS_AT_EXTSIZE) {
- /*
- * Converting bytes to fs blocks.
- */
- ip->i_d.di_extsize = vap->va_extsize >>
- mp->m_sb.sb_blocklog;
- }
- if (mask & XFS_AT_XFLAGS) {
- uint di_flags;
-
- /* can't set PREALLOC this way, just preserve it */
- di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
- if (vap->va_xflags & XFS_XFLAG_IMMUTABLE)
- di_flags |= XFS_DIFLAG_IMMUTABLE;
- if (vap->va_xflags & XFS_XFLAG_APPEND)
- di_flags |= XFS_DIFLAG_APPEND;
- if (vap->va_xflags & XFS_XFLAG_SYNC)
- di_flags |= XFS_DIFLAG_SYNC;
- if (vap->va_xflags & XFS_XFLAG_NOATIME)
- di_flags |= XFS_DIFLAG_NOATIME;
- if (vap->va_xflags & XFS_XFLAG_NODUMP)
- di_flags |= XFS_DIFLAG_NODUMP;
- if (vap->va_xflags & XFS_XFLAG_PROJINHERIT)
- di_flags |= XFS_DIFLAG_PROJINHERIT;
- if (vap->va_xflags & XFS_XFLAG_NODEFRAG)
- di_flags |= XFS_DIFLAG_NODEFRAG;
- if (vap->va_xflags & XFS_XFLAG_FILESTREAM)
- di_flags |= XFS_DIFLAG_FILESTREAM;
- if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
- if (vap->va_xflags & XFS_XFLAG_RTINHERIT)
- di_flags |= XFS_DIFLAG_RTINHERIT;
- if (vap->va_xflags & XFS_XFLAG_NOSYMLINKS)
- di_flags |= XFS_DIFLAG_NOSYMLINKS;
- if (vap->va_xflags & XFS_XFLAG_EXTSZINHERIT)
- di_flags |= XFS_DIFLAG_EXTSZINHERIT;
- } else if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
- if (vap->va_xflags & XFS_XFLAG_REALTIME)
- di_flags |= XFS_DIFLAG_REALTIME;
- if (vap->va_xflags & XFS_XFLAG_EXTSIZE)
- di_flags |= XFS_DIFLAG_EXTSIZE;
- }
- ip->i_d.di_flags = di_flags;
- }
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- timeflags |= XFS_ICHGTIME_CHG;
- }
-
- /*
- * Change file inode change time only if XFS_AT_CTIME set
+ * Change file inode change time only if ATTR_CTIME set
* AND we have been called by a DMI function.
*/
- if ( (flags & ATTR_DMI) && (mask & XFS_AT_CTIME) ) {
- ip->i_d.di_ctime.t_sec = vap->va_ctime.tv_sec;
- ip->i_d.di_ctime.t_nsec = vap->va_ctime.tv_nsec;
+ if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) {
+ inode->i_ctime = iattr->ia_ctime;
+ ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
+ ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
ip->i_update_core = 1;
timeflags &= ~XFS_ICHGTIME_CHG;
}
@@ -704,7 +542,7 @@ xfs_setattr(
* Send out timestamp changes that need to be set to the
* current time. Not done when called by a DMI function.
*/
- if (timeflags && !(flags & ATTR_DMI))
+ if (timeflags && !(flags & XFS_ATTR_DMI))
xfs_ichgtime(ip, timeflags);
XFS_STATS_INC(xs_ig_attrchg);
@@ -742,7 +580,7 @@ xfs_setattr(
}
if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
- !(flags & ATTR_DMI)) {
+ !(flags & XFS_ATTR_DMI)) {
(void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
NULL, DM_RIGHT_NULL, NULL, NULL,
0, 0, AT_DELAY_FLAG(flags));
@@ -875,7 +713,7 @@ xfs_fsync(
return XFS_ERROR(EIO);
/* capture size updates in I/O completion before writing the inode. */
- error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+ error = filemap_fdatawait(VFS_I(ip)->i_mapping);
if (error)
return XFS_ERROR(error);
@@ -1321,7 +1159,6 @@ int
xfs_release(
xfs_inode_t *ip)
{
- bhv_vnode_t *vp = XFS_ITOV(ip);
xfs_mount_t *mp = ip->i_mount;
int error;
@@ -1356,13 +1193,13 @@ xfs_release(
* be exposed to that problem.
*/
truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
- if (truncated && VN_DIRTY(vp) && ip->i_delayed_blks > 0)
+ if (truncated && VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
}
if (ip->i_d.di_nlink != 0) {
if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
- ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
+ ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
ip->i_delayed_blks > 0)) &&
(ip->i_df.if_flags & XFS_IFEXTENTS)) &&
(!(ip->i_d.di_flags &
@@ -1388,7 +1225,6 @@ int
xfs_inactive(
xfs_inode_t *ip)
{
- bhv_vnode_t *vp = XFS_ITOV(ip);
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
int committed;
@@ -1403,7 +1239,7 @@ xfs_inactive(
* If the inode is already free, then there can be nothing
* to clean up here.
*/
- if (ip->i_d.di_mode == 0 || VN_BAD(vp)) {
+ if (ip->i_d.di_mode == 0 || VN_BAD(VFS_I(ip))) {
ASSERT(ip->i_df.if_real_bytes == 0);
ASSERT(ip->i_df.if_broot_bytes == 0);
return VN_INACTIVE_CACHE;
@@ -1433,7 +1269,7 @@ xfs_inactive(
if (ip->i_d.di_nlink != 0) {
if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
- ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
+ ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
ip->i_delayed_blks > 0)) &&
(ip->i_df.if_flags & XFS_IFEXTENTS) &&
(!(ip->i_d.di_flags &
@@ -1601,12 +1437,18 @@ xfs_inactive(
return VN_INACTIVE_CACHE;
}
-
+/*
+ * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
+ * is allowed, otherwise it has to be an exact match. If a CI match is found,
+ * ci_name->name will point to a the actual name (caller must free) or
+ * will be set to NULL if an exact match is found.
+ */
int
xfs_lookup(
xfs_inode_t *dp,
struct xfs_name *name,
- xfs_inode_t **ipp)
+ xfs_inode_t **ipp,
+ struct xfs_name *ci_name)
{
xfs_ino_t inum;
int error;
@@ -1618,7 +1460,7 @@ xfs_lookup(
return XFS_ERROR(EIO);
lock_mode = xfs_ilock_map_shared(dp);
- error = xfs_dir_lookup(NULL, dp, name, &inum);
+ error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
xfs_iunlock_map_shared(dp, lock_mode);
if (error)
@@ -1626,12 +1468,15 @@ xfs_lookup(
error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0);
if (error)
- goto out;
+ goto out_free_name;
xfs_itrace_ref(*ipp);
return 0;
- out:
+out_free_name:
+ if (ci_name)
+ kmem_free(ci_name->name);
+out:
*ipp = NULL;
return error;
}
@@ -1688,7 +1533,7 @@ xfs_create(
* Make sure that we have allocated dquot(s) on disk.
*/
error = XFS_QM_DQVOPALLOC(mp, dp,
- current_fsuid(credp), current_fsgid(credp), prid,
+ current_fsuid(), current_fsgid(), prid,
XFS_QMOPT_QUOTALL|XFS_QMOPT_INHERIT, &udqp, &gdqp);
if (error)
goto std_return;
@@ -1860,111 +1705,6 @@ std_return:
}
#ifdef DEBUG
-/*
- * Some counters to see if (and how often) we are hitting some deadlock
- * prevention code paths.
- */
-
-int xfs_rm_locks;
-int xfs_rm_lock_delays;
-int xfs_rm_attempts;
-#endif
-
-/*
- * The following routine will lock the inodes associated with the
- * directory and the named entry in the directory. The locks are
- * acquired in increasing inode number.
- *
- * If the entry is "..", then only the directory is locked. The
- * vnode ref count will still include that from the .. entry in
- * this case.
- *
- * There is a deadlock we need to worry about. If the locked directory is
- * in the AIL, it might be blocking up the log. The next inode we lock
- * could be already locked by another thread waiting for log space (e.g
- * a permanent log reservation with a long running transaction (see
- * xfs_itruncate_finish)). To solve this, we must check if the directory
- * is in the ail and use lock_nowait. If we can't lock, we need to
- * drop the inode lock on the directory and try again. xfs_iunlock will
- * potentially push the tail if we were holding up the log.
- */
-STATIC int
-xfs_lock_dir_and_entry(
- xfs_inode_t *dp,
- xfs_inode_t *ip) /* inode of entry 'name' */
-{
- int attempts;
- xfs_ino_t e_inum;
- xfs_inode_t *ips[2];
- xfs_log_item_t *lp;
-
-#ifdef DEBUG
- xfs_rm_locks++;
-#endif
- attempts = 0;
-
-again:
- xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
-
- e_inum = ip->i_ino;
-
- xfs_itrace_ref(ip);
-
- /*
- * We want to lock in increasing inum. Since we've already
- * acquired the lock on the directory, we may need to release
- * if if the inum of the entry turns out to be less.
- */
- if (e_inum > dp->i_ino) {
- /*
- * We are already in the right order, so just
- * lock on the inode of the entry.
- * We need to use nowait if dp is in the AIL.
- */
-
- lp = (xfs_log_item_t *)dp->i_itemp;
- if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
- if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
- attempts++;
-#ifdef DEBUG
- xfs_rm_attempts++;
-#endif
-
- /*
- * Unlock dp and try again.
- * xfs_iunlock will try to push the tail
- * if the inode is in the AIL.
- */
-
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
- if ((attempts % 5) == 0) {
- delay(1); /* Don't just spin the CPU */
-#ifdef DEBUG
- xfs_rm_lock_delays++;
-#endif
- }
- goto again;
- }
- } else {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- }
- } else if (e_inum < dp->i_ino) {
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
- ips[0] = ip;
- ips[1] = dp;
- xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
- }
- /* else e_inum == dp->i_ino */
- /* This can happen if we're asked to lock /x/..
- * the entry is "..", which is also the parent directory.
- */
-
- return 0;
-}
-
-#ifdef DEBUG
int xfs_locked_n;
int xfs_small_retries;
int xfs_middle_retries;
@@ -2098,12 +1838,52 @@ again:
#endif
}
-#ifdef DEBUG
-#define REMOVE_DEBUG_TRACE(x) {remove_which_error_return = (x);}
-int remove_which_error_return = 0;
-#else /* ! DEBUG */
-#define REMOVE_DEBUG_TRACE(x)
-#endif /* ! DEBUG */
+/*
+ * xfs_lock_two_inodes() can only be used to lock one type of lock
+ * at a time - the iolock or the ilock, but not both at once. If
+ * we lock both at once, lockdep will report false positives saying
+ * we have violated locking orders.
+ */
+void
+xfs_lock_two_inodes(
+ xfs_inode_t *ip0,
+ xfs_inode_t *ip1,
+ uint lock_mode)
+{
+ xfs_inode_t *temp;
+ int attempts = 0;
+ xfs_log_item_t *lp;
+
+ if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
+ ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
+ ASSERT(ip0->i_ino != ip1->i_ino);
+
+ if (ip0->i_ino > ip1->i_ino) {
+ temp = ip0;
+ ip0 = ip1;
+ ip1 = temp;
+ }
+
+ again:
+ xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));
+
+ /*
+ * If the first lock we have locked is in the AIL, we must TRY to get
+ * the second lock. If we can't get it, we must release the first one
+ * and try again.
+ */
+ lp = (xfs_log_item_t *)ip0->i_itemp;
+ if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
+ if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
+ xfs_iunlock(ip0, lock_mode);
+ if ((++attempts % 5) == 0)
+ delay(1); /* Don't just spin the CPU */
+ goto again;
+ }
+ } else {
+ xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
+ }
+}
int
xfs_remove(
@@ -2113,6 +1893,7 @@ xfs_remove(
{
xfs_mount_t *mp = dp->i_mount;
xfs_trans_t *tp = NULL;
+ int is_dir = S_ISDIR(ip->i_d.di_mode);
int error = 0;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
@@ -2120,8 +1901,10 @@ xfs_remove(
int committed;
int link_zero;
uint resblks;
+ uint log_count;
xfs_itrace_entry(dp);
+ xfs_itrace_entry(ip);
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
@@ -2134,19 +1917,23 @@ xfs_remove(
return error;
}
- xfs_itrace_entry(ip);
- xfs_itrace_ref(ip);
-
error = XFS_QM_DQATTACH(mp, dp, 0);
- if (!error)
- error = XFS_QM_DQATTACH(mp, ip, 0);
- if (error) {
- REMOVE_DEBUG_TRACE(__LINE__);
+ if (error)
goto std_return;
- }
- tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+ error = XFS_QM_DQATTACH(mp, ip, 0);
+ if (error)
+ goto std_return;
+
+ if (is_dir) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
+ log_count = XFS_DEFAULT_LOG_COUNT;
+ } else {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
+ log_count = XFS_REMOVE_LOG_COUNT;
+ }
cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+
/*
* We try to get the real space reservation first,
* allowing for directory btree deletion(s) implying
@@ -2158,25 +1945,19 @@ xfs_remove(
*/
resblks = XFS_REMOVE_SPACE_RES(mp);
error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
+ XFS_TRANS_PERM_LOG_RES, log_count);
if (error == ENOSPC) {
resblks = 0;
error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_REMOVE_LOG_COUNT);
+ XFS_TRANS_PERM_LOG_RES, log_count);
}
if (error) {
ASSERT(error != ENOSPC);
- REMOVE_DEBUG_TRACE(__LINE__);
- xfs_trans_cancel(tp, 0);
- return error;
+ cancel_flags = 0;
+ goto out_trans_cancel;
}
- error = xfs_lock_dir_and_entry(dp, ip);
- if (error) {
- REMOVE_DEBUG_TRACE(__LINE__);
- xfs_trans_cancel(tp, cancel_flags);
- goto std_return;
- }
+ xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
/*
* At this point, we've gotten both the directory and the entry
@@ -2189,46 +1970,83 @@ xfs_remove(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
/*
- * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
+ * If we're removing a directory perform some additional validation.
*/
+ if (is_dir) {
+ ASSERT(ip->i_d.di_nlink >= 2);
+ if (ip->i_d.di_nlink != 2) {
+ error = XFS_ERROR(ENOTEMPTY);
+ goto out_trans_cancel;
+ }
+ if (!xfs_dir_isempty(ip)) {
+ error = XFS_ERROR(ENOTEMPTY);
+ goto out_trans_cancel;
+ }
+ }
+
XFS_BMAP_INIT(&free_list, &first_block);
error = xfs_dir_removename(tp, dp, name, ip->i_ino,
&first_block, &free_list, resblks);
if (error) {
ASSERT(error != ENOENT);
- REMOVE_DEBUG_TRACE(__LINE__);
- goto error1;
+ goto out_bmap_cancel;
}
xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ /*
+ * Bump the in memory generation count on the parent
+ * directory so that other can know that it has changed.
+ */
dp->i_gen++;
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
- error = xfs_droplink(tp, ip);
- if (error) {
- REMOVE_DEBUG_TRACE(__LINE__);
- goto error1;
+ if (is_dir) {
+ /*
+ * Drop the link from ip's "..".
+ */
+ error = xfs_droplink(tp, dp);
+ if (error)
+ goto out_bmap_cancel;
+
+ /*
+ * Drop the link from dp to ip.
+ */
+ error = xfs_droplink(tp, ip);
+ if (error)
+ goto out_bmap_cancel;
+ } else {
+ /*
+ * When removing a non-directory we need to log the parent
+ * inode here for the i_gen update. For a directory this is
+ * done implicitly by the xfs_droplink call for the ".." entry.
+ */
+ xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
}
- /* Determine if this is the last link while
+ /*
+ * Drop the "." link from ip to self.
+ */
+ error = xfs_droplink(tp, ip);
+ if (error)
+ goto out_bmap_cancel;
+
+ /*
+ * Determine if this is the last link while
* we are in the transaction.
*/
- link_zero = (ip)->i_d.di_nlink==0;
+ link_zero = (ip->i_d.di_nlink == 0);
/*
* If this is a synchronous mount, make sure that the
* remove transaction goes to disk before returning to
* the user.
*/
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+ if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
xfs_trans_set_sync(tp);
- }
error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error) {
- REMOVE_DEBUG_TRACE(__LINE__);
- goto error_rele;
- }
+ if (error)
+ goto out_bmap_cancel;
error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
if (error)
@@ -2240,38 +2058,26 @@ xfs_remove(
* will get killed on last close in xfs_close() so we don't
* have to worry about that.
*/
- if (link_zero && xfs_inode_is_filestream(ip))
+ if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
xfs_filestream_deassociate(ip);
xfs_itrace_exit(ip);
+ xfs_itrace_exit(dp);
-/* Fall through to std_return with error = 0 */
std_return:
if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
- (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
- dp, DM_RIGHT_NULL,
- NULL, DM_RIGHT_NULL,
- name->name, NULL, ip->i_d.di_mode, error, 0);
+ XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, dp, DM_RIGHT_NULL,
+ NULL, DM_RIGHT_NULL, name->name, NULL,
+ ip->i_d.di_mode, error, 0);
}
- return error;
- error1:
- xfs_bmap_cancel(&free_list);
- cancel_flags |= XFS_TRANS_ABORT;
- xfs_trans_cancel(tp, cancel_flags);
- goto std_return;
+ return error;
- error_rele:
- /*
- * In this case make sure to not release the inode until after
- * the current transaction is aborted. Releasing it beforehand
- * can cause us to go to xfs_inactive and start a recursive
- * transaction which can easily deadlock with the current one.
- */
+ out_bmap_cancel:
xfs_bmap_cancel(&free_list);
cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
xfs_trans_cancel(tp, cancel_flags);
-
goto std_return;
}
@@ -2283,7 +2089,6 @@ xfs_link(
{
xfs_mount_t *mp = tdp->i_mount;
xfs_trans_t *tp;
- xfs_inode_t *ips[2];
int error;
xfs_bmap_free_t free_list;
xfs_fsblock_t first_block;
@@ -2331,15 +2136,7 @@ xfs_link(
goto error_return;
}
- if (sip->i_ino < tdp->i_ino) {
- ips[0] = sip;
- ips[1] = tdp;
- } else {
- ips[0] = tdp;
- ips[1] = sip;
- }
-
- xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL);
+ xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
/*
* Increment vnode ref counts since xfs_trans_commit &
@@ -2480,7 +2277,7 @@ xfs_mkdir(
* Make sure that we have allocated dquot(s) on disk.
*/
error = XFS_QM_DQVOPALLOC(mp, dp,
- current_fsuid(credp), current_fsgid(credp), prid,
+ current_fsuid(), current_fsgid(), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
if (error)
goto std_return;
@@ -2638,186 +2435,6 @@ std_return:
}
int
-xfs_rmdir(
- xfs_inode_t *dp,
- struct xfs_name *name,
- xfs_inode_t *cdp)
-{
- xfs_mount_t *mp = dp->i_mount;
- xfs_trans_t *tp;
- int error;
- xfs_bmap_free_t free_list;
- xfs_fsblock_t first_block;
- int cancel_flags;
- int committed;
- int last_cdp_link;
- uint resblks;
-
- xfs_itrace_entry(dp);
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(EIO);
-
- if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
- error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
- dp, DM_RIGHT_NULL,
- NULL, DM_RIGHT_NULL, name->name,
- NULL, cdp->i_d.di_mode, 0, 0);
- if (error)
- return XFS_ERROR(error);
- }
-
- /*
- * Get the dquots for the inodes.
- */
- error = XFS_QM_DQATTACH(mp, dp, 0);
- if (!error)
- error = XFS_QM_DQATTACH(mp, cdp, 0);
- if (error) {
- REMOVE_DEBUG_TRACE(__LINE__);
- goto std_return;
- }
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
- cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
- /*
- * We try to get the real space reservation first,
- * allowing for directory btree deletion(s) implying
- * possible bmap insert(s). If we can't get the space
- * reservation then we use 0 instead, and avoid the bmap
- * btree insert(s) in the directory code by, if the bmap
- * insert tries to happen, instead trimming the LAST
- * block from the directory.
- */
- resblks = XFS_REMOVE_SPACE_RES(mp);
- error = xfs_trans_reserve(tp, resblks, XFS_REMOVE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
- if (error == ENOSPC) {
- resblks = 0;
- error = xfs_trans_reserve(tp, 0, XFS_REMOVE_LOG_RES(mp), 0,
- XFS_TRANS_PERM_LOG_RES, XFS_DEFAULT_LOG_COUNT);
- }
- if (error) {
- ASSERT(error != ENOSPC);
- cancel_flags = 0;
- goto error_return;
- }
- XFS_BMAP_INIT(&free_list, &first_block);
-
- /*
- * Now lock the child directory inode and the parent directory
- * inode in the proper order. This will take care of validating
- * that the directory entry for the child directory inode has
- * not changed while we were obtaining a log reservation.
- */
- error = xfs_lock_dir_and_entry(dp, cdp);
- if (error) {
- xfs_trans_cancel(tp, cancel_flags);
- goto std_return;
- }
-
- IHOLD(dp);
- xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
-
- IHOLD(cdp);
- xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL);
-
- ASSERT(cdp->i_d.di_nlink >= 2);
- if (cdp->i_d.di_nlink != 2) {
- error = XFS_ERROR(ENOTEMPTY);
- goto error_return;
- }
- if (!xfs_dir_isempty(cdp)) {
- error = XFS_ERROR(ENOTEMPTY);
- goto error_return;
- }
-
- error = xfs_dir_removename(tp, dp, name, cdp->i_ino,
- &first_block, &free_list, resblks);
- if (error)
- goto error1;
-
- xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-
- /*
- * Bump the in memory generation count on the parent
- * directory so that other can know that it has changed.
- */
- dp->i_gen++;
-
- /*
- * Drop the link from cdp's "..".
- */
- error = xfs_droplink(tp, dp);
- if (error) {
- goto error1;
- }
-
- /*
- * Drop the link from dp to cdp.
- */
- error = xfs_droplink(tp, cdp);
- if (error) {
- goto error1;
- }
-
- /*
- * Drop the "." link from cdp to self.
- */
- error = xfs_droplink(tp, cdp);
- if (error) {
- goto error1;
- }
-
- /* Determine these before committing transaction */
- last_cdp_link = (cdp)->i_d.di_nlink==0;
-
- /*
- * If this is a synchronous mount, make sure that the
- * rmdir transaction goes to disk before returning to
- * the user.
- */
- if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
- xfs_trans_set_sync(tp);
- }
-
- error = xfs_bmap_finish (&tp, &free_list, &committed);
- if (error) {
- xfs_bmap_cancel(&free_list);
- xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
- XFS_TRANS_ABORT));
- goto std_return;
- }
-
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- if (error) {
- goto std_return;
- }
-
-
- /* Fall through to std_return with error = 0 or the errno
- * from xfs_trans_commit. */
- std_return:
- if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
- (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
- dp, DM_RIGHT_NULL,
- NULL, DM_RIGHT_NULL,
- name->name, NULL, cdp->i_d.di_mode,
- error, 0);
- }
- return error;
-
- error1:
- xfs_bmap_cancel(&free_list);
- cancel_flags |= XFS_TRANS_ABORT;
- /* FALLTHROUGH */
-
- error_return:
- xfs_trans_cancel(tp, cancel_flags);
- goto std_return;
-}
-
-int
xfs_symlink(
xfs_inode_t *dp,
struct xfs_name *link_name,
@@ -2886,7 +2503,7 @@ xfs_symlink(
* Make sure that we have allocated dquot(s) on disk.
*/
error = XFS_QM_DQVOPALLOC(mp, dp,
- current_fsuid(credp), current_fsgid(credp), prid,
+ current_fsuid(), current_fsgid(), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
if (error)
goto std_return;
@@ -3181,14 +2798,13 @@ int
xfs_reclaim(
xfs_inode_t *ip)
{
- bhv_vnode_t *vp = XFS_ITOV(ip);
xfs_itrace_entry(ip);
- ASSERT(!VN_MAPPED(vp));
+ ASSERT(!VN_MAPPED(VFS_I(ip)));
/* bad inode, get out here ASAP */
- if (VN_BAD(vp)) {
+ if (VN_BAD(VFS_I(ip))) {
xfs_ireclaim(ip);
return 0;
}
@@ -3225,7 +2841,7 @@ xfs_reclaim(
XFS_MOUNT_ILOCK(mp);
spin_lock(&ip->i_flags_lock);
__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
- vn_to_inode(vp)->i_private = NULL;
+ VFS_I(ip)->i_private = NULL;
ip->i_vnode = NULL;
spin_unlock(&ip->i_flags_lock);
list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
@@ -3241,8 +2857,7 @@ xfs_finish_reclaim(
int sync_mode)
{
xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
- bhv_vnode_t *vp = XFS_ITOV_NULL(ip);
- int error;
+ struct inode *vp = VFS_I(ip);
if (vp && VN_BAD(vp))
goto reclaim;
@@ -3285,29 +2900,16 @@ xfs_finish_reclaim(
xfs_iflock(ip);
}
- if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- if (ip->i_update_core ||
- ((ip->i_itemp != NULL) &&
- (ip->i_itemp->ili_format.ilf_fields != 0))) {
- error = xfs_iflush(ip, sync_mode);
- /*
- * If we hit an error, typically because of filesystem
- * shutdown, we don't need to let vn_reclaim to know
- * because we're gonna reclaim the inode anyway.
- */
- if (error) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- goto reclaim;
- }
- xfs_iflock(ip); /* synchronize with xfs_iflush_done */
- }
-
- ASSERT(ip->i_update_core == 0);
- ASSERT(ip->i_itemp == NULL ||
- ip->i_itemp->ili_format.ilf_fields == 0);
+ /*
+ * In the case of a forced shutdown we rely on xfs_iflush() to
+ * wait for the inode to be unpinned before returning an error.
+ */
+ if (xfs_iflush(ip, sync_mode) == 0) {
+ /* synchronize with xfs_iflush_done */
+ xfs_iflock(ip);
+ xfs_ifunlock(ip);
}
- xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
reclaim:
@@ -3418,7 +3020,7 @@ xfs_alloc_file_space(
/* Generate a DMAPI event if needed. */
if (alloc_type != 0 && offset < ip->i_size &&
- (attr_flags&ATTR_DMI) == 0 &&
+ (attr_flags & XFS_ATTR_DMI) == 0 &&
DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
xfs_off_t end_dmi_offset;
@@ -3532,7 +3134,7 @@ retry:
allocatesize_fsb -= allocated_fsb;
}
dmapi_enospc_check:
- if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
+ if (error == ENOSPC && (attr_flags & XFS_ATTR_DMI) == 0 &&
DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
ip, DM_RIGHT_NULL,
@@ -3558,6 +3160,13 @@ error1: /* Just cancel transaction */
/*
* Zero file bytes between startoff and endoff inclusive.
* The iolock is held exclusive and no blocks are buffered.
+ *
+ * This function is used by xfs_free_file_space() to zero
+ * partial blocks when the range to free is not block aligned.
+ * When unreserving space with boundaries that are not block
+ * aligned we round up the start and round down the end
+ * boundaries and then use this function to zero the parts of
+ * the blocks that got dropped during the rounding.
*/
STATIC int
xfs_zero_remaining_bytes(
@@ -3574,6 +3183,17 @@ xfs_zero_remaining_bytes(
int nimap;
int error = 0;
+ /*
+ * Avoid doing I/O beyond eof - it's not necessary
+ * since nothing can read beyond eof. The space will
+ * be zeroed when the file is extended anyway.
+ */
+ if (startoff >= ip->i_size)
+ return 0;
+
+ if (endoff > ip->i_size)
+ endoff = ip->i_size;
+
bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp);
@@ -3643,7 +3263,6 @@ xfs_free_file_space(
xfs_off_t len,
int attr_flags)
{
- bhv_vnode_t *vp;
int committed;
int done;
xfs_off_t end_dmi_offset;
@@ -3663,7 +3282,6 @@ xfs_free_file_space(
xfs_trans_t *tp;
int need_iolock = 1;
- vp = XFS_ITOV(ip);
mp = ip->i_mount;
xfs_itrace_entry(ip);
@@ -3679,7 +3297,7 @@ xfs_free_file_space(
end_dmi_offset = offset + len;
endoffset_fsb = XFS_B_TO_FSBT(mp, end_dmi_offset);
- if (offset < ip->i_size && (attr_flags & ATTR_DMI) == 0 &&
+ if (offset < ip->i_size && (attr_flags & XFS_ATTR_DMI) == 0 &&
DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
if (end_dmi_offset > ip->i_size)
end_dmi_offset = ip->i_size;
@@ -3690,7 +3308,7 @@ xfs_free_file_space(
return error;
}
- if (attr_flags & ATTR_NOLOCK)
+ if (attr_flags & XFS_ATTR_NOLOCK)
need_iolock = 0;
if (need_iolock) {
xfs_ilock(ip, XFS_IOLOCK_EXCL);
@@ -3700,7 +3318,7 @@ xfs_free_file_space(
rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
ioffset = offset & ~(rounding - 1);
- if (VN_CACHED(vp) != 0) {
+ if (VN_CACHED(VFS_I(ip)) != 0) {
xfs_inval_cached_trace(ip, ioffset, -1, ioffset, -1);
error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
if (error)
@@ -3867,7 +3485,7 @@ xfs_change_file_space(
xfs_off_t startoffset;
xfs_off_t llen;
xfs_trans_t *tp;
- bhv_vattr_t va;
+ struct iattr iattr;
xfs_itrace_entry(ip);
@@ -3941,10 +3559,10 @@ xfs_change_file_space(
break;
}
- va.va_mask = XFS_AT_SIZE;
- va.va_size = startoffset;
+ iattr.ia_valid = ATTR_SIZE;
+ iattr.ia_size = startoffset;
- error = xfs_setattr(ip, &va, attr_flags, credp);
+ error = xfs_setattr(ip, &iattr, attr_flags, credp);
if (error)
return error;
@@ -3974,7 +3592,7 @@ xfs_change_file_space(
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_ihold(tp, ip);
- if ((attr_flags & ATTR_DMI) == 0) {
+ if ((attr_flags & XFS_ATTR_DMI) == 0) {
ip->i_d.di_mode &= ~S_ISUID;
/*
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 57335ba4ce5..e932a96bec5 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -2,9 +2,9 @@
#define _XFS_VNODEOPS_H 1
struct attrlist_cursor_kern;
-struct bhv_vattr;
struct cred;
struct file;
+struct iattr;
struct inode;
struct iovec;
struct kiocb;
@@ -15,14 +15,18 @@ struct xfs_iomap;
int xfs_open(struct xfs_inode *ip);
-int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags,
+int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags,
struct cred *credp);
+#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */
+#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
+#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
+
int xfs_readlink(struct xfs_inode *ip, char *link);
int xfs_fsync(struct xfs_inode *ip);
int xfs_release(struct xfs_inode *ip);
int xfs_inactive(struct xfs_inode *ip);
int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
- struct xfs_inode **ipp);
+ struct xfs_inode **ipp, struct xfs_name *ci_name);
int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp);
int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
@@ -31,8 +35,6 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
struct xfs_name *target_name);
int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
mode_t mode, struct xfs_inode **ipp, struct cred *credp);
-int xfs_rmdir(struct xfs_inode *dp, struct xfs_name *name,
- struct xfs_inode *cdp);
int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
xfs_off_t *offset, filldir_t filldir);
int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,