aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c1
-rw-r--r--fs/bio.c158
-rw-r--r--fs/cramfs/inode.c1
-rw-r--r--fs/dlm/dlm_internal.h1
-rw-r--r--fs/ext2/ioctl.c57
-rw-r--r--fs/ext3/ioctl.c103
-rw-r--r--fs/ext4/ioctl.c86
-rw-r--r--fs/fat/file.c12
-rw-r--r--fs/file_table.c42
-rw-r--r--fs/hfsplus/ioctl.c40
-rw-r--r--fs/inode.c51
-rw-r--r--fs/jffs2/jffs2_fs_i.h2
-rw-r--r--fs/jffs2/jffs2_fs_sb.h2
-rw-r--r--fs/jfs/ioctl.c33
-rw-r--r--fs/locks.c1
-rw-r--r--fs/namei.c275
-rw-r--r--fs/namespace.c316
-rw-r--r--fs/ncpfs/ioctl.c54
-rw-r--r--fs/nfs/dir.c3
-rw-r--r--fs/nfsd/nfs4proc.c7
-rw-r--r--fs/nfsd/nfs4recover.c16
-rw-r--r--fs/nfsd/nfs4state.c3
-rw-r--r--fs/nfsd/vfs.c72
-rw-r--r--fs/ocfs2/ioctl.c11
-rw-r--r--fs/open.c149
-rw-r--r--fs/partitions/check.c4
-rw-r--r--fs/reiserfs/ioctl.c63
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/reiserfs/xattr.c1
-rw-r--r--fs/super.c24
-rw-r--r--fs/sysfs/dir.c1
-rw-r--r--fs/sysfs/file.c6
-rw-r--r--fs/utimes.c18
-rw-r--r--fs/xattr.c40
-rw-r--r--fs/xfs/linux-2.6/sema.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c7
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c9
38 files changed, 1289 insertions, 399 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index dfebdbe7440..3031e3233dd 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -26,7 +26,6 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/idr.h>
-#include <asm/semaphore.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
diff --git a/fs/bio.c b/fs/bio.c
index 553b5b7960a..6e0b6f66df0 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -444,22 +444,27 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
struct bio_map_data {
struct bio_vec *iovecs;
- void __user *userptr;
+ int nr_sgvecs;
+ struct sg_iovec *sgvecs;
};
-static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio)
+static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
+ struct sg_iovec *iov, int iov_count)
{
memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
+ memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
+ bmd->nr_sgvecs = iov_count;
bio->bi_private = bmd;
}
static void bio_free_map_data(struct bio_map_data *bmd)
{
kfree(bmd->iovecs);
+ kfree(bmd->sgvecs);
kfree(bmd);
}
-static struct bio_map_data *bio_alloc_map_data(int nr_segs)
+static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)
{
struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);
@@ -467,13 +472,71 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs)
return NULL;
bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
- if (bmd->iovecs)
+ if (!bmd->iovecs) {
+ kfree(bmd);
+ return NULL;
+ }
+
+ bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL);
+ if (bmd->sgvecs)
return bmd;
+ kfree(bmd->iovecs);
kfree(bmd);
return NULL;
}
+static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
+ int uncopy)
+{
+ int ret = 0, i;
+ struct bio_vec *bvec;
+ int iov_idx = 0;
+ unsigned int iov_off = 0;
+ int read = bio_data_dir(bio) == READ;
+
+ __bio_for_each_segment(bvec, bio, i, 0) {
+ char *bv_addr = page_address(bvec->bv_page);
+ unsigned int bv_len = bvec->bv_len;
+
+ while (bv_len && iov_idx < iov_count) {
+ unsigned int bytes;
+ char *iov_addr;
+
+ bytes = min_t(unsigned int,
+ iov[iov_idx].iov_len - iov_off, bv_len);
+ iov_addr = iov[iov_idx].iov_base + iov_off;
+
+ if (!ret) {
+ if (!read && !uncopy)
+ ret = copy_from_user(bv_addr, iov_addr,
+ bytes);
+ if (read && uncopy)
+ ret = copy_to_user(iov_addr, bv_addr,
+ bytes);
+
+ if (ret)
+ ret = -EFAULT;
+ }
+
+ bv_len -= bytes;
+ bv_addr += bytes;
+ iov_addr += bytes;
+ iov_off += bytes;
+
+ if (iov[iov_idx].iov_len == iov_off) {
+ iov_idx++;
+ iov_off = 0;
+ }
+ }
+
+ if (uncopy)
+ __free_page(bvec->bv_page);
+ }
+
+ return ret;
+}
+
/**
* bio_uncopy_user - finish previously mapped bio
* @bio: bio being terminated
@@ -484,55 +547,56 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs)
int bio_uncopy_user(struct bio *bio)
{
struct bio_map_data *bmd = bio->bi_private;
- const int read = bio_data_dir(bio) == READ;
- struct bio_vec *bvec;
- int i, ret = 0;
+ int ret;
- __bio_for_each_segment(bvec, bio, i, 0) {
- char *addr = page_address(bvec->bv_page);
- unsigned int len = bmd->iovecs[i].bv_len;
+ ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1);
- if (read && !ret && copy_to_user(bmd->userptr, addr, len))
- ret = -EFAULT;
-
- __free_page(bvec->bv_page);
- bmd->userptr += len;
- }
bio_free_map_data(bmd);
bio_put(bio);
return ret;
}
/**
- * bio_copy_user - copy user data to bio
+ * bio_copy_user_iov - copy user data to bio
* @q: destination block queue
- * @uaddr: start of user address
- * @len: length in bytes
+ * @iov: the iovec.
+ * @iov_count: number of elements in the iovec
* @write_to_vm: bool indicating writing to pages or not
*
* Prepares and returns a bio for indirect user io, bouncing data
* to/from kernel pages as necessary. Must be paired with
* call bio_uncopy_user() on io completion.
*/
-struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
- unsigned int len, int write_to_vm)
+struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
+ int iov_count, int write_to_vm)
{
- unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = uaddr >> PAGE_SHIFT;
struct bio_map_data *bmd;
struct bio_vec *bvec;
struct page *page;
struct bio *bio;
int i, ret;
+ int nr_pages = 0;
+ unsigned int len = 0;
- bmd = bio_alloc_map_data(end - start);
+ for (i = 0; i < iov_count; i++) {
+ unsigned long uaddr;
+ unsigned long end;
+ unsigned long start;
+
+ uaddr = (unsigned long)iov[i].iov_base;
+ end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = uaddr >> PAGE_SHIFT;
+
+ nr_pages += end - start;
+ len += iov[i].iov_len;
+ }
+
+ bmd = bio_alloc_map_data(nr_pages, iov_count);
if (!bmd)
return ERR_PTR(-ENOMEM);
- bmd->userptr = (void __user *) uaddr;
-
ret = -ENOMEM;
- bio = bio_alloc(GFP_KERNEL, end - start);
+ bio = bio_alloc(GFP_KERNEL, nr_pages);
if (!bio)
goto out_bmd;
@@ -564,22 +628,12 @@ struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
* success
*/
if (!write_to_vm) {
- char __user *p = (char __user *) uaddr;
-
- /*
- * for a write, copy in data to kernel pages
- */
- ret = -EFAULT;
- bio_for_each_segment(bvec, bio, i) {
- char *addr = page_address(bvec->bv_page);
-
- if (copy_from_user(addr, p, bvec->bv_len))
- goto cleanup;
- p += bvec->bv_len;
- }
+ ret = __bio_copy_iov(bio, iov, iov_count, 0);
+ if (ret)
+ goto cleanup;
}
- bio_set_map_data(bmd, bio);
+ bio_set_map_data(bmd, bio, iov, iov_count);
return bio;
cleanup:
bio_for_each_segment(bvec, bio, i)
@@ -591,6 +645,28 @@ out_bmd:
return ERR_PTR(ret);
}
+/**
+ * bio_copy_user - copy user data to bio
+ * @q: destination block queue
+ * @uaddr: start of user address
+ * @len: length in bytes
+ * @write_to_vm: bool indicating writing to pages or not
+ *
+ * Prepares and returns a bio for indirect user io, bouncing data
+ * to/from kernel pages as necessary. Must be paired with
+ * call bio_uncopy_user() on io completion.
+ */
+struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
+ unsigned int len, int write_to_vm)
+{
+ struct sg_iovec iov;
+
+ iov.iov_base = (void __user *)uaddr;
+ iov.iov_len = len;
+
+ return bio_copy_user_iov(q, &iov, 1, write_to_vm);
+}
+
static struct bio *__bio_map_user_iov(struct request_queue *q,
struct block_device *bdev,
struct sg_iovec *iov, int iov_count,
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 350680fd7da..0c3b618c15b 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -23,7 +23,6 @@
#include <linux/buffer_head.h>
#include <linux/vfs.h>
#include <linux/mutex.h>
-#include <asm/semaphore.h>
#include <asm/uaccess.h>
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d30ea8b433a..7a8824f475f 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -37,7 +37,6 @@
#include <linux/jhash.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
-#include <asm/semaphore.h>
#include <asm/uaccess.h>
#include <linux/dlm.h>
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index b8ea11fee5c..de876fa793e 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -12,6 +12,7 @@
#include <linux/time.h>
#include <linux/sched.h>
#include <linux/compat.h>
+#include <linux/mount.h>
#include <linux/smp_lock.h>
#include <asm/current.h>
#include <asm/uaccess.h>
@@ -23,6 +24,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
struct ext2_inode_info *ei = EXT2_I(inode);
unsigned int flags;
unsigned short rsv_window_size;
+ int ret;
ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg);
@@ -34,14 +36,19 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case EXT2_IOC_SETFLAGS: {
unsigned int oldflags;
- if (IS_RDONLY(inode))
- return -EROFS;
+ ret = mnt_want_write(filp->f_path.mnt);
+ if (ret)
+ return ret;
- if (!is_owner_or_cap(inode))
- return -EACCES;
+ if (!is_owner_or_cap(inode)) {
+ ret = -EACCES;
+ goto setflags_out;
+ }
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
+ if (get_user(flags, (int __user *) arg)) {
+ ret = -EFAULT;
+ goto setflags_out;
+ }
if (!S_ISDIR(inode->i_mode))
flags &= ~EXT2_DIRSYNC_FL;
@@ -50,7 +57,8 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
/* Is it quota file? Do not allow user to mess with it */
if (IS_NOQUOTA(inode)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ ret = -EPERM;
+ goto setflags_out;
}
oldflags = ei->i_flags;
@@ -63,7 +71,8 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ ret = -EPERM;
+ goto setflags_out;
}
}
@@ -75,20 +84,26 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
ext2_set_inode_flags(inode);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setflags_out:
+ mnt_drop_write(filp->f_path.mnt);
+ return ret;
}
case EXT2_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *) arg);
case EXT2_IOC_SETVERSION:
if (!is_owner_or_cap(inode))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
- if (get_user(inode->i_generation, (int __user *) arg))
- return -EFAULT;
- inode->i_ctime = CURRENT_TIME_SEC;
- mark_inode_dirty(inode);
- return 0;
+ ret = mnt_want_write(filp->f_path.mnt);
+ if (ret)
+ return ret;
+ if (get_user(inode->i_generation, (int __user *) arg)) {
+ ret = -EFAULT;
+ } else {
+ inode->i_ctime = CURRENT_TIME_SEC;
+ mark_inode_dirty(inode);
+ }
+ mnt_drop_write(filp->f_path.mnt);
+ return ret;
case EXT2_IOC_GETRSVSZ:
if (test_opt(inode->i_sb, RESERVATION)
&& S_ISREG(inode->i_mode)
@@ -102,15 +117,16 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
return -ENOTTY;
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+ if (!is_owner_or_cap(inode))
return -EACCES;
if (get_user(rsv_window_size, (int __user *)arg))
return -EFAULT;
+ ret = mnt_want_write(filp->f_path.mnt);
+ if (ret)
+ return ret;
+
if (rsv_window_size > EXT2_MAX_RESERVE_BLOCKS)
rsv_window_size = EXT2_MAX_RESERVE_BLOCKS;
@@ -131,6 +147,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
rsv->rsv_goal_size = rsv_window_size;
}
mutex_unlock(&ei->truncate_mutex);
+ mnt_drop_write(filp->f_path.mnt);
return 0;
}
default:
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 023a070f55f..0d0c7015164 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -12,6 +12,7 @@
#include <linux/capability.h>
#include <linux/ext3_fs.h>
#include <linux/ext3_jbd.h>
+#include <linux/mount.h>
#include <linux/time.h>
#include <linux/compat.h>
#include <linux/smp_lock.h>
@@ -38,14 +39,19 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
unsigned int oldflags;
unsigned int jflag;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
- if (!is_owner_or_cap(inode))
- return -EACCES;
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto flags_out;
+ }
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
+ if (get_user(flags, (int __user *) arg)) {
+ err = -EFAULT;
+ goto flags_out;
+ }
if (!S_ISDIR(inode->i_mode))
flags &= ~EXT3_DIRSYNC_FL;
@@ -54,7 +60,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
/* Is it quota file? Do not allow user to mess with it */
if (IS_NOQUOTA(inode)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ err = -EPERM;
+ goto flags_out;
}
oldflags = ei->i_flags;
@@ -70,7 +77,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ err = -EPERM;
+ goto flags_out;
}
}
@@ -81,7 +89,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
if (!capable(CAP_SYS_RESOURCE)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ err = -EPERM;
+ goto flags_out;
}
}
@@ -89,7 +98,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
handle = ext3_journal_start(inode, 1);
if (IS_ERR(handle)) {
mutex_unlock(&inode->i_mutex);
- return PTR_ERR(handle);
+ err = PTR_ERR(handle);
+ goto flags_out;
}
if (IS_SYNC(inode))
handle->h_sync = 1;
@@ -115,6 +125,8 @@ flags_err:
if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
err = ext3_change_inode_journal_flag(inode, jflag);
mutex_unlock(&inode->i_mutex);
+flags_out:
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
case EXT3_IOC_GETVERSION:
@@ -129,14 +141,18 @@ flags_err:
if (!is_owner_or_cap(inode))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
- if (get_user(generation, (int __user *) arg))
- return -EFAULT;
-
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+ if (get_user(generation, (int __user *) arg)) {
+ err = -EFAULT;
+ goto setversion_out;
+ }
handle = ext3_journal_start(inode, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto setversion_out;
+ }
err = ext3_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
inode->i_ctime = CURRENT_TIME_SEC;
@@ -144,6 +160,8 @@ flags_err:
err = ext3_mark_iloc_dirty(handle, inode, &iloc);
}
ext3_journal_stop(handle);
+setversion_out:
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
#ifdef CONFIG_JBD_DEBUG
@@ -179,18 +197,24 @@ flags_err:
}
return -ENOTTY;
case EXT3_IOC_SETRSVSZ: {
+ int err;
if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
return -ENOTTY;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
- if (!is_owner_or_cap(inode))
- return -EACCES;
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto setrsvsz_out;
+ }
- if (get_user(rsv_window_size, (int __user *)arg))
- return -EFAULT;
+ if (get_user(rsv_window_size, (int __user *)arg)) {
+ err = -EFAULT;
+ goto setrsvsz_out;
+ }
if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
@@ -208,7 +232,9 @@ flags_err:
rsv->rsv_goal_size = rsv_window_size;
}
mutex_unlock(&ei->truncate_mutex);
- return 0;
+setrsvsz_out:
+ mnt_drop_write(filp->f_path.mnt);
+ return err;
}
case EXT3_IOC_GROUP_EXTEND: {
ext3_fsblk_t n_blocks_count;
@@ -218,17 +244,20 @@ flags_err:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (get_user(n_blocks_count, (__u32 __user *)arg))
- return -EFAULT;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+ if (get_user(n_blocks_count, (__u32 __user *)arg)) {
+ err = -EFAULT;
+ goto group_extend_out;
+ }
err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
journal_lock_updates(EXT3_SB(sb)->s_journal);
journal_flush(EXT3_SB(sb)->s_journal);
journal_unlock_updates(EXT3_SB(sb)->s_journal);
-
+group_extend_out:
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
case EXT3_IOC_GROUP_ADD: {
@@ -239,18 +268,22 @@ flags_err:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg,
- sizeof(input)))
- return -EFAULT;
+ sizeof(input))) {
+ err = -EFAULT;
+ goto group_add_out;
+ }
err = ext3_group_add(sb, &input);
journal_lock_updates(EXT3_SB(sb)->s_journal);
journal_flush(EXT3_SB(sb)->s_journal);
journal_unlock_updates(EXT3_SB(sb)->s_journal);
-
+group_add_out:
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2ed7c37f897..25b13ede808 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -15,6 +15,7 @@
#include <linux/time.h>
#include <linux/compat.h>
#include <linux/smp_lock.h>
+#include <linux/mount.h>
#include <asm/uaccess.h>
int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
@@ -38,24 +39,25 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
unsigned int oldflags;
unsigned int jflag;
- if (IS_RDONLY(inode))
- return -EROFS;
-
if (!is_owner_or_cap(inode))
return -EACCES;
if (get_user(flags, (int __user *) arg))
return -EFAULT;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+
if (!S_ISDIR(inode->i_mode))
flags &= ~EXT4_DIRSYNC_FL;
+ err = -EPERM;
mutex_lock(&inode->i_mutex);
/* Is it quota file? Do not allow user to mess with it */
- if (IS_NOQUOTA(inode)) {
- mutex_unlock(&inode->i_mutex);
- return -EPERM;
- }
+ if (IS_NOQUOTA(inode))
+ goto flags_out;
+
oldflags = ei->i_flags;
/* The JOURNAL_DATA flag is modifiable only by root */
@@ -68,10 +70,8 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
* This test looks nicer. Thanks to Pauline Middelink
*/
if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
- if (!capable(CAP_LINUX_IMMUTABLE)) {
- mutex_unlock(&inode->i_mutex);
- return -EPERM;
- }
+ if (!capable(CAP_LINUX_IMMUTABLE))
+ goto flags_out;
}
/*
@@ -79,17 +79,14 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
* the relevant capability.
*/
if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
- if (!capable(CAP_SYS_RESOURCE)) {
- mutex_unlock(&inode->i_mutex);
- return -EPERM;
- }
+ if (!capable(CAP_SYS_RESOURCE))
+ goto flags_out;
}
-
handle = ext4_journal_start(inode, 1);
if (IS_ERR(handle)) {
- mutex_unlock(&inode->i_mutex);
- return PTR_ERR(handle);
+ err = PTR_ERR(handle);
+ goto flags_out;
}
if (IS_SYNC(inode))
handle->h_sync = 1;
@@ -107,14 +104,14 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
flags_err:
ext4_journal_stop(handle);
- if (err) {
- mutex_unlock(&inode->i_mutex);
- return err;
- }
+ if (err)
+ goto flags_out;
if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
err = ext4_change_inode_journal_flag(inode, jflag);
+flags_out:
mutex_unlock(&inode->i_mutex);
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
case EXT4_IOC_GETVERSION:
@@ -129,14 +126,20 @@ flags_err:
if (!is_owner_or_cap(inode))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
- if (get_user(generation, (int __user *) arg))
- return -EFAULT;
+
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+ if (get_user(generation, (int __user *) arg)) {
+ err = -EFAULT;
+ goto setversion_out;
+ }
handle = ext4_journal_start(inode, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto setversion_out;
+ }
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err == 0) {
inode->i_ctime = ext4_current_time(inode);
@@ -144,6 +147,8 @@ flags_err:
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
ext4_journal_stop(handle);
+setversion_out:
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
#ifdef CONFIG_JBD2_DEBUG
@@ -179,19 +184,21 @@ flags_err:
}
return -ENOTTY;
case EXT4_IOC_SETRSVSZ: {
+ int err;
if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
return -ENOTTY;
- if (IS_RDONLY(inode))
- return -EROFS;
-
if (!is_owner_or_cap(inode))
return -EACCES;
if (get_user(rsv_window_size, (int __user *)arg))
return -EFAULT;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+
if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
@@ -208,6 +215,7 @@ flags_err:
rsv->rsv_goal_size = rsv_window_size;
}
up_write(&ei->i_data_sem);
+ mnt_drop_write(filp->f_path.mnt);
return 0;
}
case EXT4_IOC_GROUP_EXTEND: {
@@ -218,16 +226,18 @@ flags_err:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
-
if (get_user(n_blocks_count, (__u32 __user *)arg))
return -EFAULT;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+
err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
@@ -239,17 +249,19 @@ flags_err:
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
-
if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
sizeof(input)))
return -EFAULT;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+
err = ext4_group_add(sb, &input);
jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
jbd2_journal_flush(EXT4_SB(sb)->s_journal);
jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ mnt_drop_write(filp->f_path.mnt);
return err;
}
diff --git a/fs/fat/file.c b/fs/fat/file.c
index c614175876e..2a3bed96704 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
#include <linux/capability.h>
#include <linux/module.h>
+#include <linux/mount.h>
#include <linux/time.h>
#include <linux/msdos_fs.h>
#include <linux/smp_lock.h>
@@ -46,10 +47,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
mutex_lock(&inode->i_mutex);
- if (IS_RDONLY(inode)) {
- err = -EROFS;
- goto up;
- }
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ goto up_no_drop_write;
/*
* ATTR_VOLUME and ATTR_DIR cannot be changed; this also
@@ -105,7 +105,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED;
mark_inode_dirty(inode);
- up:
+up:
+ mnt_drop_write(filp->f_path.mnt);
+up_no_drop_write:
mutex_unlock(&inode->i_mutex);
return err;
}
diff --git a/fs/file_table.c b/fs/file_table.c
index 986ff4ed0a7..7a0a9b87225 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -42,6 +42,7 @@ static inline void file_free_rcu(struct rcu_head *head)
static inline void file_free(struct file *f)
{
percpu_counter_dec(&nr_files);
+ file_check_state(f);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
}
@@ -199,6 +200,18 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
file->f_mapping = dentry->d_inode->i_mapping;
file->f_mode = mode;
file->f_op = fop;
+
+ /*
+ * These mounts don't really matter in practice
+ * for r/o bind mounts. They aren't userspace-
+ * visible. We do this for consistency, and so
+ * that we can do debugging checks at __fput()
+ */
+ if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
+ file_take_write(file);
+ error = mnt_want_write(mnt);
+ WARN_ON(error);
+ }
return error;
}
EXPORT_SYMBOL(init_file);
@@ -211,6 +224,31 @@ void fput(struct file *file)
EXPORT_SYMBOL(fput);
+/**
+ * drop_file_write_access - give up ability to write to a file
+ * @file: the file to which we will stop writing
+ *
+ * This is a central place which will give up the ability
+ * to write to @file, along with access to write through
+ * its vfsmount.
+ */
+void drop_file_write_access(struct file *file)
+{
+ struct vfsmount *mnt = file->f_path.mnt;
+ struct dentry *dentry = file->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+
+ put_write_access(inode);
+
+ if (special_file(inode->i_mode))
+ return;
+ if (file_check_writeable(file) != 0)
+ return;
+ mnt_drop_write(mnt);
+ file_release_write(file);
+}
+EXPORT_SYMBOL_GPL(drop_file_write_access);
+
/* __fput is called from task context when aio completion releases the last
* last use of a struct file *. Do not use otherwise.
*/
@@ -236,10 +274,10 @@ void __fput(struct file *file)
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
cdev_put(inode->i_cdev);
fops_put(file->f_op);
- if (file->f_mode & FMODE_WRITE)
- put_write_access(inode);
put_pid(file->f_owner.pid);
file_kill(file);
+ if (file->f_mode & FMODE_WRITE)
+ drop_file_write_access(file);
file->f_path.dentry = NULL;
file->f_path.mnt = NULL;
file_free(file);
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index b60c0affbec..f457d2ca51a 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -14,6 +14,7 @@
#include <linux/capability.h>
#include <linux/fs.h>
+#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/xattr.h>
#include <asm/uaccess.h>
@@ -35,25 +36,32 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
return put_user(flags, (int __user *)arg);
case HFSPLUS_IOC_EXT2_SETFLAGS: {
- if (IS_RDONLY(inode))
- return -EROFS;
-
- if (!is_owner_or_cap(inode))
- return -EACCES;
-
- if (get_user(flags, (int __user *)arg))
- return -EFAULT;
-
+ int err = 0;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto setflags_out;
+ }
+ if (get_user(flags, (int __user *)arg)) {
+ err = -EFAULT;
+ goto setflags_out;
+ }
if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
- if (!capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
+ if (!capable(CAP_LINUX_IMMUTABLE)) {
+ err = -EPERM;
+ goto setflags_out;
+ }
}
/* don't silently ignore unsupported ext2 flags */
- if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL))
- return -EOPNOTSUPP;
-
+ if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
+ err = -EOPNOTSUPP;
+ goto setflags_out;
+ }
if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */
inode->i_flags |= S_IMMUTABLE;
HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -75,7 +83,9 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setflags_out:
+ mnt_drop_write(filp->f_path.mnt);
+ return err;
}
default:
return -ENOTTY;
diff --git a/fs/inode.c b/fs/inode.c
index 53245ffcf93..27ee1af50d0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1199,42 +1199,37 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
struct inode *inode = dentry->d_inode;
struct timespec now;
- if (inode->i_flags & S_NOATIME)
+ if (mnt_want_write(mnt))
return;
+ if (inode->i_flags & S_NOATIME)
+ goto out;
if (IS_NOATIME(inode))
- return;
+ goto out;
if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
- return;
+ goto out;
- /*
- * We may have a NULL vfsmount when coming from NFSD
- */
- if (mnt) {
- if (mnt->mnt_flags & MNT_NOATIME)
- return;
- if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
- return;
-
- if (mnt->mnt_flags & MNT_RELATIME) {
- /*
- * With relative atime, only update atime if the
- * previous atime is earlier than either the ctime or
- * mtime.
- */
- if (timespec_compare(&inode->i_mtime,
- &inode->i_atime) < 0 &&
- timespec_compare(&inode->i_ctime,
- &inode->i_atime) < 0)
- return;
- }
+ if (mnt->mnt_flags & MNT_NOATIME)
+ goto out;
+ if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
+ goto out;
+ if (mnt->mnt_flags & MNT_RELATIME) {
+ /*
+ * With relative atime, only update atime if the previous
+ * atime is earlier than either the ctime or mtime.
+ */
+ if (timespec_compare(&inode->i_mtime, &inode->i_atime) < 0 &&
+ timespec_compare(&inode->i_ctime, &inode->i_atime) < 0)
+ goto out;
}
now = current_fs_time(inode->i_sb);
if (timespec_equal(&inode->i_atime, &now))
- return;
+ goto out;
inode->i_atime = now;
mark_inode_dirty_sync(inode);
+out:
+ mnt_drop_write(mnt);
}
EXPORT_SYMBOL(touch_atime);
@@ -1255,10 +1250,13 @@ void file_update_time(struct file *file)
struct inode *inode = file->f_path.dentry->d_inode;
struct timespec now;
int sync_it = 0;
+ int err;
if (IS_NOCMTIME(inode))
return;
- if (IS_RDONLY(inode))
+
+ err = mnt_want_write(file->f_path.mnt);
+ if (err)
return;
now = current_fs_time(inode->i_sb);
@@ -1279,6 +1277,7 @@ void file_update_time(struct file *file)
if (sync_it)
mark_inode_dirty_sync(inode);
+ mnt_drop_write(file->f_path.mnt);
}
EXPORT_SYMBOL(file_update_time);
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 0b78fdc9773..a841f4973a7 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -15,7 +15,7 @@
#include <linux/version.h>
#include <linux/rbtree.h>
#include <linux/posix_acl.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
struct jffs2_inode_info {
/* We need an internal mutex similar to inode->i_mutex.
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 3a2197f3c81..18fca2b9e53 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -16,7 +16,7 @@
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <linux/completion.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/list.h>
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index a1f8e375ad2..afe222bf300 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/ctype.h>
#include <linux/capability.h>
+#include <linux/mount.h>
#include <linux/time.h>
#include <linux/sched.h>
#include <asm/current.h>
@@ -65,23 +66,30 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return put_user(flags, (int __user *) arg);
case JFS_IOC_SETFLAGS: {
unsigned int oldflags;
+ int err;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
- if (!is_owner_or_cap(inode))
- return -EACCES;
-
- if (get_user(flags, (int __user *) arg))
- return -EFAULT;
+ if (!is_owner_or_cap(inode)) {
+ err = -EACCES;
+ goto setflags_out;
+ }
+ if (get_user(flags, (int __user *) arg)) {
+ err = -EFAULT;
+ goto setflags_out;
+ }
flags = jfs_map_ext2(flags, 1);
if (!S_ISDIR(inode->i_mode))
flags &= ~JFS_DIRSYNC_FL;
/* Is it quota file? Do not allow user to mess with it */
- if (IS_NOQUOTA(inode))
- return -EPERM;
+ if (IS_NOQUOTA(inode)) {
+ err = -EPERM;
+ goto setflags_out;
+ }
/* Lock against other parallel changes of flags */
mutex_lock(&inode->i_mutex);
@@ -98,7 +106,8 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
(JFS_APPEND_FL | JFS_IMMUTABLE_FL))) {
if (!capable(CAP_LINUX_IMMUTABLE)) {
mutex_unlock(&inode->i_mutex);
- return -EPERM;
+ err = -EPERM;
+ goto setflags_out;
}
}
@@ -110,7 +119,9 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
mutex_unlock(&inode->i_mutex);
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setflags_out:
+ mnt_drop_write(filp->f_path.mnt);
+ return err;
}
default:
return -ENOTTY;
diff --git a/fs/locks.c b/fs/locks.c
index 43c0af21a0c..592faadbcec 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -127,7 +127,6 @@
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
-#include <asm/semaphore.h>
#include <asm/uaccess.h>
#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
diff --git a/fs/namei.c b/fs/namei.c
index 8cf9bb9c2fc..e179f71bfcb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1623,8 +1623,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
return -EACCES;
flag &= ~O_TRUNC;
- } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE))
- return -EROFS;
+ }
error = vfs_permission(nd, acc_mode);
if (error)
@@ -1677,7 +1676,12 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
return 0;
}
-static int open_namei_create(struct nameidata *nd, struct path *path,
+/*
+ * Be careful about ever adding any more callers of this
+ * function. Its flags must be in the namei format, not
+ * what get passed to sys_open().
+ */
+static int __open_namei_create(struct nameidata *nd, struct path *path,
int flag, int mode)
{
int error;
@@ -1696,26 +1700,56 @@ static int open_namei_create(struct nameidata *nd, struct path *path,
}
/*
- * open_namei()
+ * Note that while the flag value (low two bits) for sys_open means:
+ * 00 - read-only
+ * 01 - write-only
+ * 10 - read-write
+ * 11 - special
+ * it is changed into
+ * 00 - no permissions needed
+ * 01 - read-permission
+ * 10 - write-permission
+ * 11 - read-write
+ * for the internal routines (ie open_namei()/follow_link() etc)
+ * This is more logical, and also allows the 00 "no perm needed"
+ * to be used for symlinks (where the permissions are checked
+ * later).
*
- * namei for open - this is in fact almost the whole open-routine.
- *
- * Note that the low bits of "flag" aren't the same as in the open
- * system call - they are 00 - no permissions needed
- * 01 - read permission needed
- * 10 - write permission needed
- * 11 - read/write permissions needed
- * which is a lot more logical, and also allows the "no perm" needed
- * for symlinks (where the permissions are checked later).
- * SMP-safe
+*/
+static inline int open_to_namei_flags(int flag)
+{
+ if ((flag+1) & O_ACCMODE)
+ flag++;
+ return flag;
+}
+
+static int open_will_write_to_fs(int flag, struct inode *inode)
+{
+ /*
+ * We'll never write to the fs underlying
+ * a device file.
+ */
+ if (special_file(inode->i_mode))
+ return 0;
+ return (flag & O_TRUNC);
+}
+
+/*
+ * Note that the low bits of the passed in "open_flag"
+ * are not the same as in the local variable "flag". See
+ * open_to_namei_flags() for more details.
*/
-int open_namei(int dfd, const char *pathname, int flag,
- int mode, struct nameidata *nd)
+struct file *do_filp_open(int dfd, const char *pathname,
+ int open_flag, int mode)
{
+ struct file *filp;
+ struct nameidata nd;
int acc_mode, error;
struct path path;
struct dentry *dir;
int count = 0;
+ int will_write;
+ int flag = open_to_namei_flags(open_flag);
acc_mode = ACC_MODE(flag);
@@ -1733,18 +1767,19 @@ int open_namei(int dfd, const char *pathname, int flag,
*/
if (!(flag & O_CREAT)) {
error = path_lookup_open(dfd, pathname, lookup_flags(flag),
- nd, flag);
+ &nd, flag);
if (error)
- return error;
+ return ERR_PTR(error);
goto ok;
}
/*
* Create - we need to know the parent.
*/
- error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode);
+ error = path_lookup_create(dfd, pathname, LOOKUP_PARENT,
+ &nd, flag, mode);
if (error)
- return error;
+ return ERR_PTR(error);
/*
* We have the parent and last component. First of all, check
@@ -1752,14 +1787,14 @@ int open_namei(int dfd, const char *pathname, int flag,
* will not do.
*/
error = -EISDIR;
- if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len])
+ if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
goto exit;
- dir = nd->path.dentry;
- nd->flags &= ~LOOKUP_PARENT;
+ dir = nd.path.dentry;
+ nd.flags &= ~LOOKUP_PARENT;
mutex_lock(&dir->d_inode->i_mutex);
- path.dentry = lookup_hash(nd);
- path.mnt = nd->path.mnt;
+ path.dentry = lookup_hash(&nd);
+ path.mnt = nd.path.mnt;
do_last:
error = PTR_ERR(path.dentry);
@@ -1768,18 +1803,31 @@ do_last:
goto exit;
}
- if (IS_ERR(nd->intent.open.file)) {
- mutex_unlock(&dir->d_inode->i_mutex);
- error = PTR_ERR(nd->intent.open.file);
- goto exit_dput;
+ if (IS_ERR(nd.intent.open.file)) {
+ error = PTR_ERR(nd.intent.open.file);
+ goto exit_mutex_unlock;
}
/* Negative dentry, just create the file */
if (!path.dentry->d_inode) {
- error = open_namei_create(nd, &path, flag, mode);
+ /*
+ * This write is needed to ensure that a
+ * ro->rw transition does not occur between
+ * the time when the file is created and when
+ * a permanent write count is taken through
+ * the 'struct file' in nameidata_to_filp().
+ */
+ error = mnt_want_write(nd.path.mnt);
if (error)
+ goto exit_mutex_unlock;
+ error = __open_namei_create(&nd, &path, flag, mode);
+ if (error) {
+ mnt_drop_write(nd.path.mnt);
goto exit;
- return 0;
+ }
+ filp = nameidata_to_filp(&nd, open_flag);
+ mnt_drop_write(nd.path.mnt);
+ return filp;
}
/*
@@ -1804,23 +1852,52 @@ do_last:
if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
goto do_link;
- path_to_nameidata(&path, nd);
+ path_to_nameidata(&path, &nd);
error = -EISDIR;
if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
goto exit;
ok:
- error = may_open(nd, acc_mode, flag);
- if (error)
+ /*
+ * Consider:
+ * 1. may_open() truncates a file
+ * 2. a rw->ro mount transition occurs
+ * 3. nameidata_to_filp() fails due to
+ * the ro mount.
+ * That would be inconsistent, and should
+ * be avoided. Taking this mnt write here
+ * ensures that (2) can not occur.
+ */
+ will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
+ if (will_write) {
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto exit;
+ }
+ error = may_open(&nd, acc_mode, flag);
+ if (error) {
+ if (will_write)
+ mnt_drop_write(nd.path.mnt);
goto exit;
- return 0;
+ }
+ filp = nameidata_to_filp(&nd, open_flag);
+ /*
+ * It is now safe to drop the mnt write
+ * because the filp has had a write taken
+ * on its behalf.
+ */
+ if (will_write)
+ mnt_drop_write(nd.path.mnt);
+ return filp;
+exit_mutex_unlock:
+ mutex_unlock(&dir->d_inode->i_mutex);
exit_dput:
- path_put_conditional(&path, nd);
+ path_put_conditional(&path, &nd);
exit:
- if (!IS_ERR(nd->intent.open.file))
- release_open_intent(nd);
- path_put(&nd->path);
- return error;
+ if (!IS_ERR(nd.intent.open.file))
+ release_open_intent(&nd);
+ path_put(&nd.path);
+ return ERR_PTR(error);
do_link:
error = -ELOOP;
@@ -1836,43 +1913,60 @@ do_link:
* stored in nd->last.name and we will have to putname() it when we
* are done. Procfs-like symlinks just set LAST_BIND.
*/
- nd->flags |= LOOKUP_PARENT;
- error = security_inode_follow_link(path.dentry, nd);
+ nd.flags |= LOOKUP_PARENT;
+ error = security_inode_follow_link(path.dentry, &nd);
if (error)
goto exit_dput;
- error = __do_follow_link(&path, nd);
+ error = __do_follow_link(&path, &nd);
if (error) {
/* Does someone understand code flow here? Or it is only
* me so stupid? Anathema to whoever designed this non-sense
* with "intent.open".
*/
- release_open_intent(nd);
- return error;
+ release_open_intent(&nd);
+ return ERR_PTR(error);
}
- nd->flags &= ~LOOKUP_PARENT;
- if (nd->last_type == LAST_BIND)
+ nd.flags &= ~LOOKUP_PARENT;
+ if (nd.last_type == LAST_BIND)
goto ok;
error = -EISDIR;
- if (nd->last_type != LAST_NORM)
+ if (nd.last_type != LAST_NORM)
goto exit;
- if (nd->last.name[nd->last.len]) {
- __putname(nd->last.name);
+ if (nd.last.name[nd.last.len]) {
+ __putname(nd.last.name);
goto exit;
}
error = -ELOOP;
if (count++==32) {
- __putname(nd->last.name);
+ __putname(nd.last.name);
goto exit;
}
- dir = nd->path.dentry;
+ dir = nd.path.dentry;
mutex_lock(&dir->d_inode->i_mutex);
- path.dentry = lookup_hash(nd);
- path.mnt = nd->path.mnt;
- __putname(nd->last.name);
+ path.dentry = lookup_hash(&nd);
+ path.mnt = nd.path.mnt;
+ __putname(nd.last.name);
goto do_last;
}
/**
+ * filp_open - open file and return file pointer
+ *
+ * @filename: path to open
+ * @flags: open flags as per the open(2) second argument
+ * @mode: mode for the new file if O_CREAT is set, else ignored
+ *
+ * This is the helper to open a file from kernelspace if you really
+ * have to. But in generally you should not do this, so please move
+ * along, nothing to see here..
+ */
+struct file *filp_open(const char *filename, int flags, int mode)
+{
+ return do_filp_open(AT_FDCWD, filename, flags, mode);
+}
+EXPORT_SYMBOL(filp_open);
+
+/**
* lookup_create - lookup a dentry, creating it if it doesn't exist
* @nd: nameidata info
* @is_dir: directory flag
@@ -1945,6 +2039,23 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
return error;
}
+static int may_mknod(mode_t mode)
+{
+ switch (mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
+ case 0: /* zero mode translates to S_IFREG */
+ return 0;
+ case S_IFDIR:
+ return -EPERM;
+ default:
+ return -EINVAL;
+ }
+}
+
asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
unsigned dev)
{
@@ -1963,12 +2074,19 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
if (error)
goto out;
dentry = lookup_create(&nd, 0);
- error = PTR_ERR(dentry);
-
+ if (IS_ERR(dentry)) {
+ error = PTR_ERR(dentry);
+ goto out_unlock;
+ }
if (!IS_POSIXACL(nd.path.dentry->d_inode))
mode &= ~current->fs->umask;
- if (!IS_ERR(dentry)) {
- switch (mode & S_IFMT) {
+ error = may_mknod(mode);
+ if (error)
+ goto out_dput;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_dput;
+ switch (mode & S_IFMT) {
case 0: case S_IFREG:
error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
break;
@@ -1979,14 +2097,11 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
case S_IFIFO: case S_IFSOCK:
error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
break;
- case S_IFDIR:
- error = -EPERM;
- break;
- default:
- error = -EINVAL;
- }
- dput(dentry);
}
+ mnt_drop_write(nd.path.mnt);
+out_dput:
+ dput(dentry);
+out_unlock:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
path_put(&nd.path);
out:
@@ -2044,7 +2159,12 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
if (!IS_POSIXACL(nd.path.dentry->d_inode))
mode &= ~current->fs->umask;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_dput;
error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
+ mnt_drop_write(nd.path.mnt);
+out_dput:
dput(dentry);
out_unlock:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2151,7 +2271,12 @@ static long do_rmdir(int dfd, const char __user *pathname)
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto exit2;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto exit3;
error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+ mnt_drop_write(nd.path.mnt);
+exit3:
dput(dentry);
exit2:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2232,7 +2357,11 @@ static long do_unlinkat(int dfd, const char __user *pathname)
inode = dentry->d_inode;
if (inode)
atomic_inc(&inode->i_count);
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto exit2;
error = vfs_unlink(nd.path.dentry->d_inode, dentry);
+ mnt_drop_write(nd.path.mnt);
exit2:
dput(dentry);
}
@@ -2313,7 +2442,12 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
if (IS_ERR(dentry))
goto out_unlock;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_dput;
error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO);
+ mnt_drop_write(nd.path.mnt);
+out_dput:
dput(dentry);
out_unlock:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2408,7 +2542,12 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto out_unlock;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_dput;
error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry);
+ mnt_drop_write(nd.path.mnt);
+out_dput:
dput(new_dentry);
out_unlock:
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2634,8 +2773,12 @@ static int do_rename(int olddfd, const char *oldname,
if (new_dentry == trap)
goto exit5;
+ error = mnt_want_write(oldnd.path.mnt);
+ if (error)
+ goto exit5;
error = vfs_rename(old_dir->d_inode, old_dentry,
new_dir->d_inode, new_dentry);
+ mnt_drop_write(oldnd.path.mnt);
exit5:
dput(new_dentry);
exit4:
diff --git a/fs/namespace.c b/fs/namespace.c
index 94f026ec990..678f7ce060f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -17,6 +17,7 @@
#include <linux/quotaops.h>
#include <linux/acct.h>
#include <linux/capability.h>
+#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/sysfs.h>
#include <linux/seq_file.h>
@@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
return tmp & (HASH_SIZE - 1);
}
+#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+
struct vfsmount *alloc_vfsmnt(const char *name)
{
struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
INIT_LIST_HEAD(&mnt->mnt_share);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
+ atomic_set(&mnt->__mnt_writers, 0);
if (name) {
int size = strlen(name) + 1;
char *newname = kmalloc(size, GFP_KERNEL);
@@ -80,6 +84,263 @@ struct vfsmount *alloc_vfsmnt(const char *name)
return mnt;
}
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
+/*
+ * __mnt_is_readonly: check whether a mount is read-only
+ * @mnt: the mount to check for its write status
+ *
+ * This shouldn't be used directly ouside of the VFS.
+ * It does not guarantee that the filesystem will stay
+ * r/w, just that it is right *now*. This can not and
+ * should not be used in place of IS_RDONLY(inode).
+ * mnt_want/drop_write() will _keep_ the filesystem
+ * r/w.
+ */
+int __mnt_is_readonly(struct vfsmount *mnt)
+{
+ if (mnt->mnt_flags & MNT_READONLY)
+ return 1;
+ if (mnt->mnt_sb->s_flags & MS_RDONLY)
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__mnt_is_readonly);
+
+struct mnt_writer {
+ /*
+ * If holding multiple instances of this lock, they
+ * must be ordered by cpu number.
+ */
+ spinlock_t lock;
+ struct lock_class_key lock_class; /* compiles out with !lockdep */
+ unsigned long count;
+ struct vfsmount *mnt;
+} ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+
+static int __init init_mnt_writers(void)
+{
+ int cpu;
+ for_each_possible_cpu(cpu) {
+ struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
+ spin_lock_init(&writer->lock);
+ lockdep_set_class(&writer->lock, &writer->lock_class);
+ writer->count = 0;
+ }
+ return 0;
+}
+fs_initcall(init_mnt_writers);
+
+static void unlock_mnt_writers(void)
+{
+ int cpu;
+ struct mnt_writer *cpu_writer;
+
+ for_each_possible_cpu(cpu) {
+ cpu_writer = &per_cpu(mnt_writers, cpu);
+ spin_unlock(&cpu_writer->lock);
+ }
+}
+
+static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
+{
+ if (!cpu_writer->mnt)
+ return;
+ /*
+ * This is in case anyone ever leaves an invalid,
+ * old ->mnt and a count of 0.
+ */
+ if (!cpu_writer->count)
+ return;
+ atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
+ cpu_writer->count = 0;
+}
+ /*
+ * must hold cpu_writer->lock
+ */
+static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
+ struct vfsmount *mnt)
+{
+ if (cpu_writer->mnt == mnt)
+ return;
+ __clear_mnt_count(cpu_writer);
+ cpu_writer->mnt = mnt;
+}
+
+/*
+ * Most r/o checks on a fs are for operations that take
+ * discrete amounts of time, like a write() or unlink().
+ * We must keep track of when those operations start
+ * (for permission checks) and when they end, so that
+ * we can determine when writes are able to occur to
+ * a filesystem.
+ */
+/**
+ * mnt_want_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * This tells the low-level filesystem that a write is
+ * about to be performed to it, and makes sure that
+ * writes are allowed before returning success. When
+ * the write operation is finished, mnt_drop_write()
+ * must be called. This is effectively a refcount.
+ */
+int mnt_want_write(struct vfsmount *mnt)
+{
+ int ret = 0;
+ struct mnt_writer *cpu_writer;
+
+ cpu_writer = &get_cpu_var(mnt_writers);
+ spin_lock(&cpu_writer->lock);
+ if (__mnt_is_readonly(mnt)) {
+ ret = -EROFS;
+ goto out;
+ }
+ use_cpu_writer_for_mount(cpu_writer, mnt);
+ cpu_writer->count++;
+out:
+ spin_unlock(&cpu_writer->lock);
+ put_cpu_var(mnt_writers);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mnt_want_write);
+
+static void lock_mnt_writers(void)
+{
+ int cpu;
+ struct mnt_writer *cpu_writer;
+
+ for_each_possible_cpu(cpu) {
+ cpu_writer = &per_cpu(mnt_writers, cpu);
+ spin_lock(&cpu_writer->lock);
+ __clear_mnt_count(cpu_writer);
+ cpu_writer->mnt = NULL;
+ }
+}
+
+/*
+ * These per-cpu write counts are not guaranteed to have
+ * matched increments and decrements on any given cpu.
+ * A file open()ed for write on one cpu and close()d on
+ * another cpu will imbalance this count. Make sure it
+ * does not get too far out of whack.
+ */
+static void handle_write_count_underflow(struct vfsmount *mnt)
+{
+ if (atomic_read(&mnt->__mnt_writers) >=
+ MNT_WRITER_UNDERFLOW_LIMIT)
+ return;
+ /*
+ * It isn't necessary to hold all of the locks
+ * at the same time, but doing it this way makes
+ * us share a lot more code.
+ */
+ lock_mnt_writers();
+ /*
+ * vfsmount_lock is for mnt_flags.
+ */
+ spin_lock(&vfsmount_lock);
+ /*
+ * If coalescing the per-cpu writer counts did not
+ * get us back to a positive writer count, we have
+ * a bug.
+ */
+ if ((atomic_read(&mnt->__mnt_writers) < 0) &&
+ !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
+ printk(KERN_DEBUG "leak detected on mount(%p) writers "
+ "count: %d\n",
+ mnt, atomic_read(&mnt->__mnt_writers));
+ WARN_ON(1);
+ /* use the flag to keep the dmesg spam down */
+ mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
+ }
+ spin_unlock(&vfsmount_lock);
+ unlock_mnt_writers();
+}
+
+/**
+ * mnt_drop_write - give up write access to a mount
+ * @mnt: the mount on which to give up write access
+ *
+ * Tells the low-level filesystem that we are done
+ * performing writes to it. Must be matched with
+ * mnt_want_write() call above.
+ */
+void mnt_drop_write(struct vfsmount *mnt)
+{
+ int must_check_underflow = 0;
+ struct mnt_writer *cpu_writer;
+
+ cpu_writer = &get_cpu_var(mnt_writers);
+ spin_lock(&cpu_writer->lock);
+
+ use_cpu_writer_for_mount(cpu_writer, mnt);
+ if (cpu_writer->count > 0) {
+ cpu_writer->count--;
+ } else {
+ must_check_underflow = 1;
+ atomic_dec(&mnt->__mnt_writers);
+ }
+
+ spin_unlock(&cpu_writer->lock);
+ /*
+ * Logically, we could call this each time,
+ * but the __mnt_writers cacheline tends to
+ * be cold, and makes this expensive.
+ */
+ if (must_check_underflow)
+ handle_write_count_underflow(mnt);
+ /*
+ * This could be done right after the spinlock
+ * is taken because the spinlock keeps us on
+ * the cpu, and disables preemption. However,
+ * putting it here bounds the amount that
+ * __mnt_writers can underflow. Without it,
+ * we could theoretically wrap __mnt_writers.
+ */
+ put_cpu_var(mnt_writers);
+}
+EXPORT_SYMBOL_GPL(mnt_drop_write);
+
+static int mnt_make_readonly(struct vfsmount *mnt)
+{
+ int ret = 0;
+
+ lock_mnt_writers();
+ /*
+ * With all the locks held, this value is stable
+ */
+ if (atomic_read(&mnt->__mnt_writers) > 0) {
+ ret = -EBUSY;
+ goto out;
+ }
+ /*
+ * nobody can do a successful mnt_want_write() with all
+ * of the counts in MNT_DENIED_WRITE and the locks held.
+ */
+ spin_lock(&vfsmount_lock);
+ if (!ret)
+ mnt->mnt_flags |= MNT_READONLY;
+ spin_unlock(&vfsmount_lock);
+out:
+ unlock_mnt_writers();
+ return ret;
+}
+
+static void __mnt_unmake_readonly(struct vfsmount *mnt)
+{
+ spin_lock(&vfsmount_lock);
+ mnt->mnt_flags &= ~MNT_READONLY;
+ spin_unlock(&vfsmount_lock);
+}
+
int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
{
mnt->mnt_sb = sb;
@@ -271,7 +532,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
static inline void __mntput(struct vfsmount *mnt)
{
+ int cpu;
struct super_block *sb = mnt->mnt_sb;
+ /*
+ * We don't have to hold all of the locks at the
+ * same time here because we know that we're the
+ * last reference to mnt and that no new writers
+ * can come in.
+ */
+ for_each_possible_cpu(cpu) {
+ struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
+ if (cpu_writer->mnt != mnt)
+ continue;
+ spin_lock(&cpu_writer->lock);
+ atomic_add(cpu_writer->count, &mnt->__mnt_writers);
+ cpu_writer->count = 0;
+ /*
+ * Might as well do this so that no one
+ * ever sees the pointer and expects
+ * it to be valid.
+ */
+ cpu_writer->mnt = NULL;
+ spin_unlock(&cpu_writer->lock);
+ }
+ /*
+ * This probably indicates that somebody messed
+ * up a mnt_want/drop_write() pair. If this
+ * happens, the filesystem was probably unable
+ * to make r/w->r/o transitions.
+ */
+ WARN_ON(atomic_read(&mnt->__mnt_writers));
dput(mnt->mnt_root);
free_vfsmnt(mnt);
deactivate_super(sb);
@@ -417,7 +707,7 @@ static int show_vfsmnt(struct seq_file *m, void *v)
seq_putc(m, '.');
mangle(m, mnt->mnt_sb->s_subtype);
}
- seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
+ seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
if (mnt->mnt_sb->s_flags & fs_infop->flag)
seq_puts(m, fs_infop->str);
@@ -1019,6 +1309,23 @@ out:
return err;
}
+static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
+{
+ int error = 0;
+ int readonly_request = 0;
+
+ if (ms_flags & MS_RDONLY)
+ readonly_request = 1;
+ if (readonly_request == __mnt_is_readonly(mnt))
+ return 0;
+
+ if (readonly_request)
+ error = mnt_make_readonly(mnt);
+ else
+ __mnt_unmake_readonly(mnt);
+ return error;
+}
+
/*
* change filesystem flags. dir should be a physical root of filesystem.
* If you've mounted a non-root directory somewhere and want to do remount
@@ -1041,7 +1348,10 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
return -EINVAL;
down_write(&sb->s_umount);
- err = do_remount_sb(sb, flags, data, 0);
+ if (flags & MS_BIND)
+ err = change_mount_flags(nd->path.mnt, flags);
+ else
+ err = do_remount_sb(sb, flags, data, 0);
if (!err)
nd->path.mnt->mnt_flags = mnt_flags;
up_write(&sb->s_umount);
@@ -1425,6 +1735,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
mnt_flags |= MNT_NODIRATIME;
if (flags & MS_RELATIME)
mnt_flags |= MNT_RELATIME;
+ if (flags & MS_RDONLY)
+ mnt_flags |= MNT_READONLY;
flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index c67b4bdcf71..ad8f167e54b 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -14,6 +14,7 @@
#include <linux/ioctl.h>
#include <linux/time.h>
#include <linux/mm.h>
+#include <linux/mount.h>
#include <linux/highuid.h>
#include <linux/smp_lock.h>
#include <linux/vmalloc.h>
@@ -261,7 +262,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
}
#endif /* CONFIG_NCPFS_NLS */
-int ncp_ioctl(struct inode *inode, struct file *filp,
+static int __ncp_ioctl(struct inode *inode, struct file *filp,
unsigned int cmd, unsigned long arg)
{
struct ncp_server *server = NCP_SERVER(inode);
@@ -822,6 +823,57 @@ outrel:
return -EINVAL;
}
+static int ncp_ioctl_need_write(unsigned int cmd)
+{
+ switch (cmd) {
+ case NCP_IOC_GET_FS_INFO:
+ case NCP_IOC_GET_FS_INFO_V2:
+ case NCP_IOC_NCPREQUEST:
+ case NCP_IOC_SETDENTRYTTL:
+ case NCP_IOC_SIGN_INIT:
+ case NCP_IOC_LOCKUNLOCK:
+ case NCP_IOC_SET_SIGN_WANTED:
+ return 1;
+ case NCP_IOC_GETOBJECTNAME:
+ case NCP_IOC_SETOBJECTNAME:
+ case NCP_IOC_GETPRIVATEDATA:
+ case NCP_IOC_SETPRIVATEDATA:
+ case NCP_IOC_SETCHARSETS:
+ case NCP_IOC_GETCHARSETS:
+ case NCP_IOC_CONN_LOGGED_IN:
+ case NCP_IOC_GETDENTRYTTL:
+ case NCP_IOC_GETMOUNTUID2:
+ case NCP_IOC_SIGN_WANTED:
+ case NCP_IOC_GETROOT:
+ case NCP_IOC_SETROOT:
+ return 0;
+ default:
+ /* unkown IOCTL command, assume write */
+ return 1;
+ }
+}
+
+int ncp_ioctl(struct inode *inode, struct file *filp,
+ unsigned int cmd, unsigned long arg)
+{
+ int ret;
+
+ if (ncp_ioctl_need_write(cmd)) {
+ /*
+ * inside the ioctl(), any failures which
+ * are because of file_permission() are
+ * -EACCESS, so it seems consistent to keep
+ * that here.
+ */
+ if (mnt_want_write(filp->f_path.mnt))
+ return -EACCES;
+ }
+ ret = __ncp_ioctl(inode, filp, cmd, arg);
+ if (ncp_ioctl_need_write(cmd))
+ mnt_drop_write(filp->f_path.mnt);
+ return ret;
+}
+
#ifdef CONFIG_COMPAT
long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 6cea7479c5b..d9e30ac2798 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -967,7 +967,8 @@ static int is_atomic_open(struct inode *dir, struct nameidata *nd)
if (nd->flags & LOOKUP_DIRECTORY)
return 0;
/* Are we trying to write to a read only partition? */
- if (IS_RDONLY(dir) && (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
+ if (__mnt_is_readonly(nd->path.mnt) &&
+ (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
return 0;
return 1;
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c593db047d8..c309c881bd4 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -658,14 +658,19 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status;
}
}
+ status = mnt_want_write(cstate->current_fh.fh_export->ex_path.mnt);
+ if (status)
+ return status;
status = nfs_ok;
if (setattr->sa_acl != NULL)
status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
setattr->sa_acl);
if (status)
- return status;
+ goto out;
status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
0, (time_t)0);
+out:
+ mnt_drop_write(cstate->current_fh.fh_export->ex_path.mnt);
return status;
}
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 1ff90625860..145b3c877a2 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -46,6 +46,7 @@
#include <linux/scatterlist.h>
#include <linux/crypto.h>
#include <linux/sched.h>
+#include <linux/mount.h>
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -154,7 +155,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
goto out_put;
}
+ status = mnt_want_write(rec_dir.path.mnt);
+ if (status)
+ goto out_put;
status = vfs_mkdir(rec_dir.path.dentry->d_inode, dentry, S_IRWXU);
+ mnt_drop_write(rec_dir.path.mnt);
out_put:
dput(dentry);
out_unlock:
@@ -313,12 +318,17 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
if (!rec_dir_init || !clp->cl_firststate)
return;
+ status = mnt_want_write(rec_dir.path.mnt);
+ if (status)
+ goto out;
clp->cl_firststate = 0;
nfs4_save_user(&uid, &gid);
status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
nfs4_reset_user(uid, gid);
if (status == 0)
nfsd4_sync_rec_dir();
+ mnt_drop_write(rec_dir.path.mnt);
+out:
if (status)
printk("NFSD: Failed to remove expired client state directory"
" %.*s\n", HEXDIR_LEN, clp->cl_recdir);
@@ -347,13 +357,17 @@ nfsd4_recdir_purge_old(void) {
if (!rec_dir_init)
return;
+ status = mnt_want_write(rec_dir.path.mnt);
+ if (status)
+ goto out;
status = nfsd4_list_rec_dir(rec_dir.path.dentry, purge_old);
if (status == 0)
nfsd4_sync_rec_dir();
+ mnt_drop_write(rec_dir.path.mnt);
+out:
if (status)
printk("nfsd4: failed to purge old clients from recovery"
" directory %s\n", rec_dir.path.dentry->d_name.name);
- return;
}
static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bcb97d8e8b8..81a75f3081f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -41,6 +41,7 @@
#include <linux/sunrpc/svc.h>
#include <linux/nfsd/nfsd.h>
#include <linux/nfsd/cache.h>
+#include <linux/file.h>
#include <linux/mount.h>
#include <linux/workqueue.h>
#include <linux/smp_lock.h>
@@ -1239,7 +1240,7 @@ static inline void
nfs4_file_downgrade(struct file *filp, unsigned int share_access)
{
if (share_access & NFS4_SHARE_ACCESS_WRITE) {
- put_write_access(filp->f_path.dentry->d_inode);
+ drop_file_write_access(filp);
filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
}
}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 46f59d5365a..304bf5f643c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1255,23 +1255,35 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = 0;
switch (type) {
case S_IFREG:
+ host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_nfserr;
host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
break;
case S_IFDIR:
+ host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_nfserr;
host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
+ host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_nfserr;
host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
break;
default:
printk("nfsd: bad file type %o in nfsd_create\n", type);
host_err = -EINVAL;
+ goto out_nfserr;
}
- if (host_err < 0)
+ if (host_err < 0) {
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
goto out_nfserr;
+ }
if (EX_ISSYNC(fhp->fh_export)) {
err = nfserrno(nfsd_sync_dir(dentry));
@@ -1282,6 +1294,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
err2 = nfsd_create_setattr(rqstp, resfhp, iap);
if (err2)
err = err2;
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
/*
* Update the file handle to get the new inode info.
*/
@@ -1359,6 +1372,9 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
v_atime = verifier[1]&0x7fffffff;
}
+ host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_nfserr;
if (dchild->d_inode) {
err = 0;
@@ -1390,12 +1406,15 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
case NFS3_CREATE_GUARDED:
err = nfserr_exist;
}
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
goto out;
}
host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
- if (host_err < 0)
+ if (host_err < 0) {
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
goto out_nfserr;
+ }
if (created)
*created = 1;
@@ -1420,6 +1439,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (err2)
err = err2;
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
/*
* Update the filehandle to get the new inode info.
*/
@@ -1522,6 +1542,10 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (iap && (iap->ia_valid & ATTR_MODE))
mode = iap->ia_mode & S_IALLUGO;
+ host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_nfserr;
+
if (unlikely(path[plen] != 0)) {
char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
if (path_alloced == NULL)
@@ -1542,6 +1566,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfserrno(host_err);
fh_unlock(fhp);
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
+
cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
dput(dnew);
if (err==0) err = cerr;
@@ -1592,6 +1618,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
dold = tfhp->fh_dentry;
dest = dold->d_inode;
+ host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt);
+ if (host_err) {
+ err = nfserrno(host_err);
+ goto out_dput;
+ }
host_err = vfs_link(dold, dirp, dnew);
if (!host_err) {
if (EX_ISSYNC(ffhp->fh_export)) {
@@ -1605,7 +1636,8 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
else
err = nfserrno(host_err);
}
-
+ mnt_drop_write(tfhp->fh_export->ex_path.mnt);
+out_dput:
dput(dnew);
out_unlock:
fh_unlock(ffhp);
@@ -1678,13 +1710,20 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
if (ndentry == trap)
goto out_dput_new;
-#ifdef MSNFS
- if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
+ if (svc_msnfs(ffhp) &&
((atomic_read(&odentry->d_count) > 1)
|| (atomic_read(&ndentry->d_count) > 1))) {
host_err = -EPERM;
- } else
-#endif
+ goto out_dput_new;
+ }
+
+ host_err = -EXDEV;
+ if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
+ goto out_dput_new;
+ host_err = mnt_want_write(ffhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_dput_new;
+
host_err = vfs_rename(fdir, odentry, tdir, ndentry);
if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
host_err = nfsd_sync_dir(tdentry);
@@ -1692,6 +1731,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
host_err = nfsd_sync_dir(fdentry);
}
+ mnt_drop_write(ffhp->fh_export->ex_path.mnt);
+
out_dput_new:
dput(ndentry);
out_dput_old:
@@ -1750,6 +1791,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
if (!type)
type = rdentry->d_inode->i_mode & S_IFMT;
+ host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (host_err)
+ goto out_nfserr;
+
if (type != S_IFDIR) { /* It's UNLINK */
#ifdef MSNFS
if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
@@ -1765,10 +1810,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
dput(rdentry);
if (host_err)
- goto out_nfserr;
+ goto out_drop;
if (EX_ISSYNC(fhp->fh_export))
host_err = nfsd_sync_dir(dentry);
+out_drop:
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
out_nfserr:
err = nfserrno(host_err);
out:
@@ -1865,7 +1912,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
inode->i_mode,
IS_IMMUTABLE(inode)? " immut" : "",
IS_APPEND(inode)? " append" : "",
- IS_RDONLY(inode)? " ro" : "");
+ __mnt_is_readonly(exp->ex_path.mnt)? " ro" : "");
dprintk(" owner %d/%d user %d/%d\n",
inode->i_uid, inode->i_gid, current->fsuid, current->fsgid);
#endif
@@ -1876,7 +1923,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
*/
if (!(acc & MAY_LOCAL_ACCESS))
if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
- if (exp_rdonly(rqstp, exp) || IS_RDONLY(inode))
+ if (exp_rdonly(rqstp, exp) ||
+ __mnt_is_readonly(exp->ex_path.mnt))
return nfserr_rofs;
if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
return nfserr_perm;
@@ -2039,6 +2087,9 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
} else
size = 0;
+ error = mnt_want_write(fhp->fh_export->ex_path.mnt);
+ if (error)
+ goto getout;
if (size)
error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0);
else {
@@ -2050,6 +2101,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
error = 0;
}
}
+ mnt_drop_write(fhp->fh_export->ex_path.mnt);
getout:
kfree(value);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index b413166dd16..7b142f0ce99 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -60,10 +60,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
goto bail;
}
- status = -EROFS;
- if (IS_RDONLY(inode))
- goto bail_unlock;
-
status = -EACCES;
if (!is_owner_or_cap(inode))
goto bail_unlock;
@@ -134,8 +130,13 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (get_user(flags, (int __user *) arg))
return -EFAULT;
- return ocfs2_set_inode_attr(inode, flags,
+ status = mnt_want_write(filp->f_path.mnt);
+ if (status)
+ return status;
+ status = ocfs2_set_inode_attr(inode, flags,
OCFS2_FL_MODIFIABLE);
+ mnt_drop_write(filp->f_path.mnt);
+ return status;
case OCFS2_IOC_RESVSP:
case OCFS2_IOC_RESVSP64:
case OCFS2_IOC_UNRESVSP:
diff --git a/fs/open.c b/fs/open.c
index 3fa4e4ffce4..b70e7666bb2 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -244,21 +244,21 @@ static long do_sys_truncate(const char __user * path, loff_t length)
if (!S_ISREG(inode->i_mode))
goto dput_and_out;
- error = vfs_permission(&nd, MAY_WRITE);
+ error = mnt_want_write(nd.path.mnt);
if (error)
goto dput_and_out;
- error = -EROFS;
- if (IS_RDONLY(inode))
- goto dput_and_out;
+ error = vfs_permission(&nd, MAY_WRITE);
+ if (error)
+ goto mnt_drop_write_and_out;
error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
error = get_write_access(inode);
if (error)
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
/*
* Make sure that there are no leases. get_write_access() protects
@@ -276,6 +276,8 @@ static long do_sys_truncate(const char __user * path, loff_t length)
put_write_and_out:
put_write_access(inode);
+mnt_drop_write_and_out:
+ mnt_drop_write(nd.path.mnt);
dput_and_out:
path_put(&nd.path);
out:
@@ -457,8 +459,17 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
if(res || !(mode & S_IWOTH) ||
special_file(nd.path.dentry->d_inode->i_mode))
goto out_path_release;
-
- if(IS_RDONLY(nd.path.dentry->d_inode))
+ /*
+ * This is a rare case where using __mnt_is_readonly()
+ * is OK without a mnt_want/drop_write() pair. Since
+ * no actual write to the fs is performed here, we do
+ * not need to telegraph to that to anyone.
+ *
+ * By doing this, we accept that this access is
+ * inherently racy and know that the fs may change
+ * state before we even see this result.
+ */
+ if (__mnt_is_readonly(nd.path.mnt))
res = -EROFS;
out_path_release:
@@ -567,12 +578,12 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
audit_inode(NULL, dentry);
- err = -EROFS;
- if (IS_RDONLY(inode))
+ err = mnt_want_write(file->f_path.mnt);
+ if (err)
goto out_putf;
err = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto out_putf;
+ goto out_drop_write;
mutex_lock(&inode->i_mutex);
if (mode == (mode_t) -1)
mode = inode->i_mode;
@@ -581,6 +592,8 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
err = notify_change(dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
+out_drop_write:
+ mnt_drop_write(file->f_path.mnt);
out_putf:
fput(file);
out:
@@ -600,13 +613,13 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
goto out;
inode = nd.path.dentry->d_inode;
- error = -EROFS;
- if (IS_RDONLY(inode))
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
goto dput_and_out;
error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
- goto dput_and_out;
+ goto out_drop_write;
mutex_lock(&inode->i_mutex);
if (mode == (mode_t) -1)
@@ -616,6 +629,8 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
error = notify_change(nd.path.dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
+out_drop_write:
+ mnt_drop_write(nd.path.mnt);
dput_and_out:
path_put(&nd.path);
out:
@@ -638,9 +653,6 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
printk(KERN_ERR "chown_common: NULL inode\n");
goto out;
}
- error = -EROFS;
- if (IS_RDONLY(inode))
- goto out;
error = -EPERM;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
goto out;
@@ -671,7 +683,12 @@ asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group)
error = user_path_walk(filename, &nd);
if (error)
goto out;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_release;
error = chown_common(nd.path.dentry, user, group);
+ mnt_drop_write(nd.path.mnt);
+out_release:
path_put(&nd.path);
out:
return error;
@@ -691,7 +708,12 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
error = __user_walk_fd(dfd, filename, follow, &nd);
if (error)
goto out;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_release;
error = chown_common(nd.path.dentry, user, group);
+ mnt_drop_write(nd.path.mnt);
+out_release:
path_put(&nd.path);
out:
return error;
@@ -705,7 +727,12 @@ asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group
error = user_path_walk_link(filename, &nd);
if (error)
goto out;
+ error = mnt_want_write(nd.path.mnt);
+ if (error)
+ goto out_release;
error = chown_common(nd.path.dentry, user, group);
+ mnt_drop_write(nd.path.mnt);
+out_release:
path_put(&nd.path);
out:
return error;
@@ -722,14 +749,48 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
if (!file)
goto out;
+ error = mnt_want_write(file->f_path.mnt);
+ if (error)
+ goto out_fput;
dentry = file->f_path.dentry;
audit_inode(NULL, dentry);
error = chown_common(dentry, user, group);
+ mnt_drop_write(file->f_path.mnt);
+out_fput:
fput(file);
out:
return error;
}
+/*
+ * You have to be very careful that these write
+ * counts get cleaned up in error cases and
+ * upon __fput(). This should probably never
+ * be called outside of __dentry_open().
+ */
+static inline int __get_file_write_access(struct inode *inode,
+ struct vfsmount *mnt)
+{
+ int error;
+ error = get_write_access(inode);
+ if (error)
+ return error;
+ /*
+ * Do not take mount writer counts on
+ * special files since no writes to
+ * the mount itself will occur.
+ */
+ if (!special_file(inode->i_mode)) {
+ /*
+ * Balanced in __fput()
+ */
+ error = mnt_want_write(mnt);
+ if (error)
+ put_write_access(inode);
+ }
+ return error;
+}
+
static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
int flags, struct file *f,
int (*open)(struct inode *, struct file *))
@@ -742,9 +803,11 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
FMODE_PREAD | FMODE_PWRITE;
inode = dentry->d_inode;
if (f->f_mode & FMODE_WRITE) {
- error = get_write_access(inode);
+ error = __get_file_write_access(inode, mnt);
if (error)
goto cleanup_file;
+ if (!special_file(inode->i_mode))
+ file_take_write(f);
}
f->f_mapping = inode->i_mapping;
@@ -784,8 +847,19 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
cleanup_all:
fops_put(f->f_op);
- if (f->f_mode & FMODE_WRITE)
+ if (f->f_mode & FMODE_WRITE) {
put_write_access(inode);
+ if (!special_file(inode->i_mode)) {
+ /*
+ * We don't consider this a real
+ * mnt_want/drop_write() pair
+ * because it all happenend right
+ * here, so just reset the state.
+ */
+ file_reset_write(f);
+ mnt_drop_write(mnt);
+ }
+ }
file_kill(f);
f->f_path.dentry = NULL;
f->f_path.mnt = NULL;
@@ -796,43 +870,6 @@ cleanup_file:
return ERR_PTR(error);
}
-/*
- * Note that while the flag value (low two bits) for sys_open means:
- * 00 - read-only
- * 01 - write-only
- * 10 - read-write
- * 11 - special
- * it is changed into
- * 00 - no permissions needed
- * 01 - read-permission
- * 10 - write-permission
- * 11 - read-write
- * for the internal routines (ie open_namei()/follow_link() etc). 00 is
- * used by symlinks.
- */
-static struct file *do_filp_open(int dfd, const char *filename, int flags,
- int mode)
-{
- int namei_flags, error;
- struct nameidata nd;
-
- namei_flags = flags;
- if ((namei_flags+1) & O_ACCMODE)
- namei_flags++;
-
- error = open_namei(dfd, filename, namei_flags, mode, &nd);
- if (!error)
- return nameidata_to_filp(&nd, flags);
-
- return ERR_PTR(error);
-}
-
-struct file *filp_open(const char *filename, int flags, int mode)
-{
- return do_filp_open(AT_FDCWD, filename, flags, mode);
-}
-EXPORT_SYMBOL(filp_open);
-
/**
* lookup_instantiate_filp - instantiates the open intent filp
* @nd: pointer to nameidata
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 03f808c5b79..6149e4b58c8 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -473,6 +473,10 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
return 0;
if (IS_ERR(state)) /* I/O error reading the partition table */
return -EIO;
+
+ /* tell userspace that the media / partition table may have changed */
+ kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE);
+
for (p = 1; p < state->limit; p++) {
sector_t size = state->parts[p].size;
sector_t from = state->parts[p].from;
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index e0f0f098a52..74363a7aacb 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -4,6 +4,7 @@
#include <linux/capability.h>
#include <linux/fs.h>
+#include <linux/mount.h>
#include <linux/reiserfs_fs.h>
#include <linux/time.h>
#include <asm/uaccess.h>
@@ -25,6 +26,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
unsigned long arg)
{
unsigned int flags;
+ int err = 0;
switch (cmd) {
case REISERFS_IOC_UNPACK:
@@ -48,50 +50,67 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
if (!reiserfs_attrs(inode->i_sb))
return -ENOTTY;
- if (IS_RDONLY(inode))
- return -EROFS;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
- if (!is_owner_or_cap(inode))
- return -EPERM;
-
- if (get_user(flags, (int __user *)arg))
- return -EFAULT;
-
- /* Is it quota file? Do not allow user to mess with it. */
- if (IS_NOQUOTA(inode))
- return -EPERM;
+ if (!is_owner_or_cap(inode)) {
+ err = -EPERM;
+ goto setflags_out;
+ }
+ if (get_user(flags, (int __user *)arg)) {
+ err = -EFAULT;
+ goto setflags_out;
+ }
+ /*
+ * Is it quota file? Do not allow user to mess with it
+ */
+ if (IS_NOQUOTA(inode)) {
+ err = -EPERM;
+ goto setflags_out;
+ }
if (((flags ^ REISERFS_I(inode)->
i_attrs) & (REISERFS_IMMUTABLE_FL |
REISERFS_APPEND_FL))
- && !capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
-
+ && !capable(CAP_LINUX_IMMUTABLE)) {
+ err = -EPERM;
+ goto setflags_out;
+ }
if ((flags & REISERFS_NOTAIL_FL) &&
S_ISREG(inode->i_mode)) {
int result;
result = reiserfs_unpack(inode, filp);
- if (result)
- return result;
+ if (result) {
+ err = result;
+ goto setflags_out;
+ }
}
sd_attrs_to_i_attrs(flags, inode);
REISERFS_I(inode)->i_attrs = flags;
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setflags_out:
+ mnt_drop_write(filp->f_path.mnt);
+ return err;
}
case REISERFS_IOC_GETVERSION:
return put_user(inode->i_generation, (int __user *)arg);
case REISERFS_IOC_SETVERSION:
if (!is_owner_or_cap(inode))
return -EPERM;
- if (IS_RDONLY(inode))
- return -EROFS;
- if (get_user(inode->i_generation, (int __user *)arg))
- return -EFAULT;
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+ if (get_user(inode->i_generation, (int __user *)arg)) {
+ err = -EFAULT;
+ goto setversion_out;
+ }
inode->i_ctime = CURRENT_TIME_SEC;
mark_inode_dirty(inode);
- return 0;
+setversion_out:
+ mnt_drop_write(filp->f_path.mnt);
+ return err;
default:
return -ENOTTY;
}
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index bb05a3e51b9..060eb3f598e 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -38,7 +38,7 @@
#include <asm/system.h>
#include <linux/time.h>
-#include <asm/semaphore.h>
+#include <linux/semaphore.h>
#include <linux/vmalloc.h>
#include <linux/reiserfs_fs.h>
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 344b9b96cc5..d7c4935c103 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -44,7 +44,6 @@
#include <net/checksum.h>
#include <linux/smp_lock.h>
#include <linux/stat.h>
-#include <asm/semaphore.h>
#define FL_READONLY 128
#define FL_DIR_SEM_HELD 256
diff --git a/fs/super.c b/fs/super.c
index 09008dbd264..1f8f05ede43 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -37,6 +37,7 @@
#include <linux/idr.h>
#include <linux/kobject.h>
#include <linux/mutex.h>
+#include <linux/file.h>
#include <asm/uaccess.h>
@@ -567,10 +568,29 @@ static void mark_files_ro(struct super_block *sb)
{
struct file *f;
+retry:
file_list_lock();
list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
- if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f))
- f->f_mode &= ~FMODE_WRITE;
+ struct vfsmount *mnt;
+ if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
+ continue;
+ if (!file_count(f))
+ continue;
+ if (!(f->f_mode & FMODE_WRITE))
+ continue;
+ f->f_mode &= ~FMODE_WRITE;
+ if (file_check_writeable(f) != 0)
+ continue;
+ file_release_write(f);
+ mnt = mntget(f->f_path.mnt);
+ file_list_unlock();
+ /*
+ * This can sleep, so we can't hold
+ * the file_list_lock() spinlock.
+ */
+ mnt_drop_write(mnt);
+ mntput(mnt);
+ goto retry;
}
file_list_unlock();
}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 4948d9bc405..a1c3a1fab7f 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -20,6 +20,7 @@
#include <linux/idr.h>
#include <linux/completion.h>
#include <linux/mutex.h>
+#include <linux/slab.h>
#include "sysfs.h"
DEFINE_MUTEX(sysfs_mutex);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index baa663e6938..ade9a7e6a75 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/kobject.h>
#include <linux/kallsyms.h>
+#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/poll.h>
#include <linux/list.h>
@@ -128,7 +129,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
ssize_t retval = 0;
mutex_lock(&buffer->mutex);
- if (buffer->needs_read_fill) {
+ if (buffer->needs_read_fill || *ppos == 0) {
retval = fill_read_buffer(file->f_path.dentry,buffer);
if (retval)
goto out;
@@ -409,8 +410,7 @@ static int sysfs_release(struct inode *inode, struct file *filp)
* return POLLERR|POLLPRI, and select will return the fd whether
* it is waiting for read, write, or exceptions.
* Once poll/select indicates that the value has changed, you
- * need to close and re-open the file, as simply seeking and reading
- * again will not get new data, or reset the state of 'poll'.
+ * need to close and re-open the file, or seek to 0 and read again.
* Reminder: this only works for attributes which actively support
* it, and it is not possible to test an attribute from userspace
* to see if it supports poll (Neither 'poll' nor 'select' return
diff --git a/fs/utimes.c b/fs/utimes.c
index b18da9c0b97..a2bef77dc9c 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -2,6 +2,7 @@
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/linkage.h>
+#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/sched.h>
#include <linux/stat.h>
@@ -59,6 +60,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
struct inode *inode;
struct iattr newattrs;
struct file *f = NULL;
+ struct vfsmount *mnt;
error = -EINVAL;
if (times && (!nsec_valid(times[0].tv_nsec) ||
@@ -79,18 +81,20 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
if (!f)
goto out;
dentry = f->f_path.dentry;
+ mnt = f->f_path.mnt;
} else {
error = __user_walk_fd(dfd, filename, (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW, &nd);
if (error)
goto out;
dentry = nd.path.dentry;
+ mnt = nd.path.mnt;
}
inode = dentry->d_inode;
- error = -EROFS;
- if (IS_RDONLY(inode))
+ error = mnt_want_write(mnt);
+ if (error)
goto dput_and_out;
/* Don't worry, the checks are done in inode_change_ok() */
@@ -98,7 +102,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
if (times) {
error = -EPERM;
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
if (times[0].tv_nsec == UTIME_OMIT)
newattrs.ia_valid &= ~ATTR_ATIME;
@@ -118,22 +122,24 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
} else {
error = -EACCES;
if (IS_IMMUTABLE(inode))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
if (!is_owner_or_cap(inode)) {
if (f) {
if (!(f->f_mode & FMODE_WRITE))
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
} else {
error = vfs_permission(&nd, MAY_WRITE);
if (error)
- goto dput_and_out;
+ goto mnt_drop_write_and_out;
}
}
}
mutex_lock(&inode->i_mutex);
error = notify_change(dentry, &newattrs);
mutex_unlock(&inode->i_mutex);
+mnt_drop_write_and_out:
+ mnt_drop_write(mnt);
dput_and_out:
if (f)
fput(f);
diff --git a/fs/xattr.c b/fs/xattr.c
index 3acab161546..f7062da505d 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -11,6 +11,7 @@
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/xattr.h>
+#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/syscalls.h>
@@ -32,8 +33,6 @@ xattr_permission(struct inode *inode, const char *name, int mask)
* filesystem or on an immutable / append-only inode.
*/
if (mask & MAY_WRITE) {
- if (IS_RDONLY(inode))
- return -EROFS;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
}
@@ -262,7 +261,11 @@ sys_setxattr(char __user *path, char __user *name, void __user *value,
error = user_path_walk(path, &nd);
if (error)
return error;
- error = setxattr(nd.path.dentry, name, value, size, flags);
+ error = mnt_want_write(nd.path.mnt);
+ if (!error) {
+ error = setxattr(nd.path.dentry, name, value, size, flags);
+ mnt_drop_write(nd.path.mnt);
+ }
path_put(&nd.path);
return error;
}
@@ -277,7 +280,11 @@ sys_lsetxattr(char __user *path, char __user *name, void __user *value,
error = user_path_walk_link(path, &nd);
if (error)
return error;
- error = setxattr(nd.path.dentry, name, value, size, flags);
+ error = mnt_want_write(nd.path.mnt);
+ if (!error) {
+ error = setxattr(nd.path.dentry, name, value, size, flags);
+ mnt_drop_write(nd.path.mnt);
+ }
path_put(&nd.path);
return error;
}
@@ -295,7 +302,12 @@ sys_fsetxattr(int fd, char __user *name, void __user *value,
return error;
dentry = f->f_path.dentry;
audit_inode(NULL, dentry);
- error = setxattr(dentry, name, value, size, flags);
+ error = mnt_want_write(f->f_path.mnt);
+ if (!error) {
+ error = setxattr(dentry, name, value, size, flags);
+ mnt_drop_write(f->f_path.mnt);
+ }
+out_fput:
fput(f);
return error;
}
@@ -482,7 +494,11 @@ sys_removexattr(char __user *path, char __user *name)
error = user_path_walk(path, &nd);
if (error)
return error;
- error = removexattr(nd.path.dentry, name);
+ error = mnt_want_write(nd.path.mnt);
+ if (!error) {
+ error = removexattr(nd.path.dentry, name);
+ mnt_drop_write(nd.path.mnt);
+ }
path_put(&nd.path);
return error;
}
@@ -496,7 +512,11 @@ sys_lremovexattr(char __user *path, char __user *name)
error = user_path_walk_link(path, &nd);
if (error)
return error;
- error = removexattr(nd.path.dentry, name);
+ error = mnt_want_write(nd.path.mnt);
+ if (!error) {
+ error = removexattr(nd.path.dentry, name);
+ mnt_drop_write(nd.path.mnt);
+ }
path_put(&nd.path);
return error;
}
@@ -513,7 +533,11 @@ sys_fremovexattr(int fd, char __user *name)
return error;
dentry = f->f_path.dentry;
audit_inode(NULL, dentry);
- error = removexattr(dentry, name);
+ error = mnt_want_write(f->f_path.mnt);
+ if (!error) {
+ error = removexattr(dentry, name);
+ mnt_drop_write(f->f_path.mnt);
+ }
fput(f);
return error;
}
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
index 2009e6d922c..3abe7e9ceb3 100644
--- a/fs/xfs/linux-2.6/sema.h
+++ b/fs/xfs/linux-2.6/sema.h
@@ -20,8 +20,8 @@
#include <linux/time.h>
#include <linux/wait.h>
+#include <linux/semaphore.h>
#include <asm/atomic.h>
-#include <asm/semaphore.h>
/*
* sema_t structure just maps to struct semaphore in Linux kernel.
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index bf775979385..4ddb86b73c6 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -535,8 +535,6 @@ xfs_attrmulti_attr_set(
char *kbuf;
int error = EFAULT;
- if (IS_RDONLY(inode))
- return -EROFS;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return EPERM;
if (len > XATTR_SIZE_MAX)
@@ -562,8 +560,6 @@ xfs_attrmulti_attr_remove(
char *name,
__uint32_t flags)
{
- if (IS_RDONLY(inode))
- return -EROFS;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return EPERM;
return xfs_attr_remove(XFS_I(inode), name, flags);
@@ -573,6 +569,7 @@ STATIC int
xfs_attrmulti_by_handle(
xfs_mount_t *mp,
void __user *arg,
+ struct file *parfilp,
struct inode *parinode)
{
int error;
@@ -626,13 +623,21 @@ xfs_attrmulti_by_handle(
&ops[i].am_length, ops[i].am_flags);
break;
case ATTR_OP_SET:
+ ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ if (ops[i].am_error)
+ break;
ops[i].am_error = xfs_attrmulti_attr_set(inode,
attr_name, ops[i].am_attrvalue,
ops[i].am_length, ops[i].am_flags);
+ mnt_drop_write(parfilp->f_path.mnt);
break;
case ATTR_OP_REMOVE:
+ ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+ if (ops[i].am_error)
+ break;
ops[i].am_error = xfs_attrmulti_attr_remove(inode,
attr_name, ops[i].am_flags);
+ mnt_drop_write(parfilp->f_path.mnt);
break;
default:
ops[i].am_error = EINVAL;
@@ -1133,7 +1138,7 @@ xfs_ioctl(
return xfs_attrlist_by_handle(mp, arg, inode);
case XFS_IOC_ATTRMULTI_BY_HANDLE:
- return xfs_attrmulti_by_handle(mp, arg, inode);
+ return xfs_attrmulti_by_handle(mp, arg, filp, inode);
case XFS_IOC_SWAPEXT: {
error = xfs_swapext((struct xfs_swapext __user *)arg);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 0c958cf7775..a1237dad643 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -155,13 +155,6 @@ xfs_ichgtime_fast(
*/
ASSERT((flags & XFS_ICHGTIME_ACC) == 0);
- /*
- * We're not supposed to change timestamps in readonly-mounted
- * filesystems. Throw it away if anyone asks us.
- */
- if (unlikely(IS_RDONLY(inode)))
- return;
-
if (flags & XFS_ICHGTIME_MOD) {
tvp = &inode->i_mtime;
ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec;
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 21c0dbc7409..1ebd8004469 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -51,6 +51,7 @@
#include "xfs_vnodeops.h"
#include <linux/capability.h>
+#include <linux/mount.h>
#include <linux/writeback.h>
@@ -670,10 +671,16 @@ start:
if (new_size > xip->i_size)
xip->i_new_size = new_size;
- if (likely(!(ioflags & IO_INVIS))) {
+ /*
+ * We're not supposed to change timestamps in readonly-mounted
+ * filesystems. Throw it away if anyone asks us.
+ */
+ if (likely(!(ioflags & IO_INVIS) &&
+ !mnt_want_write(file->f_path.mnt))) {
file_update_time(file);
xfs_ichgtime_fast(xip, inode,
XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ mnt_drop_write(file->f_path.mnt);
}
/*