From 4a3fd211ccfc08a88edc824300e25a87785c6a5f Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 15 Feb 2008 14:37:48 -0800 Subject: [PATCH] r/o bind mounts: elevate write count for open()s This is the first really tricky patch in the series. It elevates the writer count on a mount each time a non-special file is opened for write. We used to do this in may_open(), but Miklos pointed out that __dentry_open() is used as well to create filps. This will cover even those cases, while a call in may_open() would not have. There is also an elevated count around the vfs_create() call in open_namei(). See the comments for more details, but we need this to fix a 'create, remount, fail r/w open()' race. Some filesystems forego the use of normal vfs calls to create struct files. Make sure that these users elevate the mnt writer count because they will get __fput(), and we need to make sure they're balanced. Acked-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/namei.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 10 deletions(-) (limited to 'fs/namei.c') diff --git a/fs/namei.c b/fs/namei.c index 83c843b3fea..e179f71bfcb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1623,8 +1623,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) return -EACCES; flag &= ~O_TRUNC; - } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE)) - return -EROFS; + } error = vfs_permission(nd, acc_mode); if (error) @@ -1724,18 +1723,32 @@ static inline int open_to_namei_flags(int flag) return flag; } +static int open_will_write_to_fs(int flag, struct inode *inode) +{ + /* + * We'll never write to the fs underlying + * a device file. + */ + if (special_file(inode->i_mode)) + return 0; + return (flag & O_TRUNC); +} + /* - * Note that the low bits of "flag" aren't the same as in the open - * system call. See open_to_namei_flags(). + * Note that the low bits of the passed in "open_flag" + * are not the same as in the local variable "flag". See + * open_to_namei_flags() for more details. */ struct file *do_filp_open(int dfd, const char *pathname, int open_flag, int mode) { + struct file *filp; struct nameidata nd; int acc_mode, error; struct path path; struct dentry *dir; int count = 0; + int will_write; int flag = open_to_namei_flags(open_flag); acc_mode = ACC_MODE(flag); @@ -1791,17 +1804,30 @@ do_last: } if (IS_ERR(nd.intent.open.file)) { - mutex_unlock(&dir->d_inode->i_mutex); error = PTR_ERR(nd.intent.open.file); - goto exit_dput; + goto exit_mutex_unlock; } /* Negative dentry, just create the file */ if (!path.dentry->d_inode) { - error = __open_namei_create(&nd, &path, flag, mode); + /* + * This write is needed to ensure that a + * ro->rw transition does not occur between + * the time when the file is created and when + * a permanent write count is taken through + * the 'struct file' in nameidata_to_filp(). + */ + error = mnt_want_write(nd.path.mnt); if (error) + goto exit_mutex_unlock; + error = __open_namei_create(&nd, &path, flag, mode); + if (error) { + mnt_drop_write(nd.path.mnt); goto exit; - return nameidata_to_filp(&nd, open_flag); + } + filp = nameidata_to_filp(&nd, open_flag); + mnt_drop_write(nd.path.mnt); + return filp; } /* @@ -1831,11 +1857,40 @@ do_last: if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)) goto exit; ok: + /* + * Consider: + * 1. may_open() truncates a file + * 2. a rw->ro mount transition occurs + * 3. nameidata_to_filp() fails due to + * the ro mount. + * That would be inconsistent, and should + * be avoided. Taking this mnt write here + * ensures that (2) can not occur. + */ + will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode); + if (will_write) { + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit; + } error = may_open(&nd, acc_mode, flag); - if (error) + if (error) { + if (will_write) + mnt_drop_write(nd.path.mnt); goto exit; - return nameidata_to_filp(&nd, open_flag); + } + filp = nameidata_to_filp(&nd, open_flag); + /* + * It is now safe to drop the mnt write + * because the filp has had a write taken + * on its behalf. + */ + if (will_write) + mnt_drop_write(nd.path.mnt); + return filp; +exit_mutex_unlock: + mutex_unlock(&dir->d_inode->i_mutex); exit_dput: path_put_conditional(&path, &nd); exit: -- cgit v1.2.3