From 0820e15a35b3cf37caadf550ddb7c75a7a77afd0 Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 23 Jan 2006 12:50:04 -0800 Subject: [CIFS] Do not zero non-existent iovec in SendReceive response processing. Could cause memory leak in some readpaths depending on what junk followed it in the stack. Signed-off-by: Steve French --- fs/cifs/transport.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 7b98792150e..b12cb8a7da7 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -498,7 +498,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, else *pRespBufType = CIFS_SMALL_BUFFER; iov[0].iov_len = receive_len + 4; - iov[1].iov_len = 0; dump_smb(midQ->resp_buf, 80); /* convert the length into a more usable form */ -- cgit v1.2.3 From 17cbbafe8e82bde4258e407ce043b61f4f9a350f Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 24 Jan 2006 20:26:48 -0800 Subject: [CIFS] Make cifs default wsize match what we actually want to send (52K typically - header + 13 pages). Forgetting to set wsize on the mount command costs more than 10% on large write (can be much more) so this makes a saner default. We still shrink this default smaller if server can not support it. Signed-off-by: Steve French --- fs/cifs/connect.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 88f60aa5205..eae306fa24b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1785,7 +1785,15 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, } else if(volume_info.wsize) cifs_sb->wsize = volume_info.wsize; else - cifs_sb->wsize = CIFSMaxBufSize; /* default */ + cifs_sb->wsize = + min(PAGEVEC_SIZE * PAGE_CACHE_SIZE, 127*1024); + /* old default of CIFSMaxBufSize was too small now + that SMB Write2 can send multiple pages in kvec. + RFC1001 does not describe what happens when frame + bigger than 128K is sent so use that as max in + conjunction with 52K kvec constraint on arch with 4K + page size */ + if(cifs_sb->rsize < PAGE_CACHE_SIZE) { cifs_sb->rsize = PAGE_CACHE_SIZE; /* Windows ME does this */ -- cgit v1.2.3 From eb9bdaa3f3b9d30d09bcad47037216aa39639b8e Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 27 Jan 2006 15:11:47 -0800 Subject: Signed-off-by: Steve French --- fs/cifs/file.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 77c990f0cb9..d17c97d07c8 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1190,7 +1190,6 @@ retry: /* BB what if continued retry is requested via mount flags? */ set_bit(AS_EIO, &mapping->flags); - SetPageError(page); } else { cifs_stats_bytes_written(cifs_sb->tcon, bytes_written); @@ -1198,6 +1197,13 @@ retry: } for (i = 0; i < n_iov; i++) { page = pvec.pages[first + i]; + /* Should we also set page error on + success rc but too little data written? */ + /* BB investigate retry logic on temporary + server crash cases and how recovery works + when page marked as error */ + if(rc) + SetPageError(page); kunmap(page); unlock_page(page); page_cache_release(page); -- cgit v1.2.3 From 1877c9ea66a29563987f22d0a86c66f438a87ce2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 27 Jan 2006 18:36:11 -0800 Subject: [CIFS] Remove compiler warning Signed-off-by: Steve French --- fs/cifs/connect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index eae306fa24b..e488603fb1e 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1786,7 +1786,8 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, cifs_sb->wsize = volume_info.wsize; else cifs_sb->wsize = - min(PAGEVEC_SIZE * PAGE_CACHE_SIZE, 127*1024); + min_t(const int, PAGEVEC_SIZE * PAGE_CACHE_SIZE, + 127*1024); /* old default of CIFSMaxBufSize was too small now that SMB Write2 can send multiple pages in kvec. RFC1001 does not describe what happens when frame -- cgit v1.2.3 From fddfdeafa8396f85c666bfc5e1e920eb535514cf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 31 Jan 2006 15:24:34 +0100 Subject: [BLOCK] A few kerneldoc fixups Signed-off-by: Jens Axboe --- fs/bio.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index bbc442b8c86..1f3bb501c26 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -411,6 +411,7 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page /** * bio_add_pc_page - attempt to add page to bio + * @q: the target queue * @bio: destination bio * @page: page to add * @len: vec entry length -- cgit v1.2.3 From 3a69c7dc6f3d58aeb9ce5051fc7060d55e05c003 Mon Sep 17 00:00:00 2001 From: Yingping Lu Date: Wed, 1 Feb 2006 12:14:34 +1100 Subject: [XFS] Interim solution for attribute insertion failure during file creation due to ENOSPC. The current solution removes the inode when the attribute insertion fails. Long term solution would be to make the inode creation and attribute insertion atomic. SGI-PV: 947610 SGI-Modid: xfs-linux-melb:xfs-kern:205193a Signed-off-by: Yingping Lu Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_iops.c | 50 +++++++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 76c6df34d0d..eda7919b70a 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -262,6 +262,31 @@ has_fs_struct(struct task_struct *task) return (task->fs != init_task.fs); } +STATIC inline void +cleanup_inode( + vnode_t *dvp, + vnode_t *vp, + struct dentry *dentry, + int mode) +{ + struct dentry teardown = {}; + int err2; + + /* Oh, the horror. + * If we can't add the ACL or we fail in + * linvfs_init_security we must back out. + * ENOSPC can hit here, among other things. + */ + teardown.d_inode = LINVFS_GET_IP(vp); + teardown.d_name = dentry->d_name; + + if (S_ISDIR(mode)) + VOP_RMDIR(dvp, &teardown, NULL, err2); + else + VOP_REMOVE(dvp, &teardown, NULL, err2); + VN_RELE(vp); +} + STATIC int linvfs_mknod( struct inode *dir, @@ -316,30 +341,19 @@ linvfs_mknod( } if (!error) + { error = linvfs_init_security(vp, dir); + if (error) + cleanup_inode(dvp, vp, dentry, mode); + } if (default_acl) { if (!error) { error = _ACL_INHERIT(vp, &va, default_acl); - if (!error) { + if (!error) VMODIFY(vp); - } else { - struct dentry teardown = {}; - int err2; - - /* Oh, the horror. - * If we can't add the ACL we must back out. - * ENOSPC can hit here, among other things. - */ - teardown.d_inode = ip = LINVFS_GET_IP(vp); - teardown.d_name = dentry->d_name; - - if (S_ISDIR(mode)) - VOP_RMDIR(dvp, &teardown, NULL, err2); - else - VOP_REMOVE(dvp, &teardown, NULL, err2); - VN_RELE(vp); - } + else + cleanup_inode(dvp, vp, dentry, mode); } _ACL_FREE(default_acl); } -- cgit v1.2.3 From fad3aa1e8e2e4123a19b926fefd91ec63dd56497 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Wed, 1 Feb 2006 12:14:52 +1100 Subject: [XFS] Fix regression in xfs_buf_rele dealing with non-hashed buffers, as occur during log replay. Novell bug 145204, Fedora bug 177848. SGI-PV: 948860 SGI-Modid: xfs-linux-melb:xfs-kern:25064a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_buf.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index e44b7c1a3a3..a36a8e3b703 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -822,6 +822,13 @@ xfs_buf_rele( XB_TRACE(bp, "rele", bp->b_relse); + if (unlikely(!hash)) { + ASSERT(!bp->b_relse); + if (atomic_dec_and_test(&bp->b_hold)) + xfs_buf_free(bp); + return; + } + if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { if (bp->b_relse) { atomic_inc(&bp->b_hold); -- cgit v1.2.3 From 3fb803a990cd17546bd89c38e0e29a891f71ce7d Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Wed, 1 Feb 2006 03:04:34 -0800 Subject: [PATCH] knfsd: Restore recently broken ACL functionality to NFS server A recent patch to Allow run-time selection of NFS versions to export meant that NO nfsacl service versions were exported. This patch restored that functionality. Signed-off-by: Andreas Gruenbacher Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfssvc.c | 76 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 89ed0469686..1d163b61691 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -64,6 +64,32 @@ struct nfsd_list { }; static struct list_head nfsd_list = LIST_HEAD_INIT(nfsd_list); +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +static struct svc_stat nfsd_acl_svcstats; +static struct svc_version * nfsd_acl_version[] = { + [2] = &nfsd_acl_version2, + [3] = &nfsd_acl_version3, +}; + +#define NFSD_ACL_MINVERS 2 +#define NFSD_ACL_NRVERS (sizeof(nfsd_acl_version)/sizeof(nfsd_acl_version[0])) +static struct svc_version *nfsd_acl_versions[NFSD_ACL_NRVERS]; + +static struct svc_program nfsd_acl_program = { + .pg_prog = NFS_ACL_PROGRAM, + .pg_nvers = NFSD_ACL_NRVERS, + .pg_vers = nfsd_acl_versions, + .pg_name = "nfsd", + .pg_class = "nfsd", + .pg_stats = &nfsd_acl_svcstats, + .pg_authenticate = &svc_set_client, +}; + +static struct svc_stat nfsd_acl_svcstats = { + .program = &nfsd_acl_program, +}; +#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ + static struct svc_version * nfsd_version[] = { [2] = &nfsd_version2, #if defined(CONFIG_NFSD_V3) @@ -79,6 +105,9 @@ static struct svc_version * nfsd_version[] = { static struct svc_version *nfsd_versions[NFSD_NRVERS]; struct svc_program nfsd_program = { +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + .pg_next = &nfsd_acl_program, +#endif .pg_prog = NFS_PROGRAM, /* program number */ .pg_nvers = NFSD_NRVERS, /* nr of entries in nfsd_version */ .pg_vers = nfsd_versions, /* version table */ @@ -147,6 +176,26 @@ nfsd_svc(unsigned short port, int nrservs) nfsd_program.pg_vers[i] = nfsd_version[i]; } + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + found_one = 0; + + for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) { + if (NFSCTL_VERISSET(nfsd_versbits, i)) { + nfsd_acl_program.pg_vers[i] = + nfsd_acl_version[i]; + found_one = 1; + } else + nfsd_acl_program.pg_vers[i] = NULL; + } + + if (!found_one) { + for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) + nfsd_acl_program.pg_vers[i] = + nfsd_acl_version[i]; + } +#endif + atomic_set(&nfsd_busy, 0); error = -ENOMEM; nfsd_serv = svc_create(&nfsd_program, NFSD_BUFSIZE); @@ -411,30 +460,3 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp) nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); return 1; } - -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -static struct svc_stat nfsd_acl_svcstats; -static struct svc_version * nfsd_acl_version[] = { - [2] = &nfsd_acl_version2, - [3] = &nfsd_acl_version3, -}; - -#define NFSD_ACL_NRVERS (sizeof(nfsd_acl_version)/sizeof(nfsd_acl_version[0])) -static struct svc_program nfsd_acl_program = { - .pg_prog = NFS_ACL_PROGRAM, - .pg_nvers = NFSD_ACL_NRVERS, - .pg_vers = nfsd_acl_version, - .pg_name = "nfsd", - .pg_class = "nfsd", - .pg_stats = &nfsd_acl_svcstats, - .pg_authenticate = &svc_set_client, -}; - -static struct svc_stat nfsd_acl_svcstats = { - .program = &nfsd_acl_program, -}; - -#define nfsd_acl_program_p &nfsd_acl_program -#else -#define nfsd_acl_program_p NULL -#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ -- cgit v1.2.3 From caf736085f2f0d22a992a855d9caae14973f7ea4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 1 Feb 2006 03:04:39 -0800 Subject: [PATCH] smbfs readdir vs signal fix An old patch designed to fix http://bugme.osdl.org/show_bug.cgi?id=4497, "getdents gives empty/random result upon signal". If smbfs's readdir() is interupted by a signal, smb_readdir() failed to noticed that and proceeded to treat the unread-into page as valid directory contents. Fix that up by handling the -ERESTARTSYS. Thanks to Stian Skjelstad for reporting and testing. Cc: Stian Skjelstad Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/smbfs/dir.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index c6c33e15143..0424d06b147 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -209,6 +209,8 @@ init_cache: ctl.valid = 1; read_really: result = server->ops->readdir(filp, dirent, filldir, &ctl); + if (result == -ERESTARTSYS && page) + ClearPageUptodate(page); if (ctl.idx == -1) goto invalid_cache; /* retry */ ctl.head.end = ctl.fpos - 1; @@ -217,7 +219,8 @@ finished: if (page) { cache->head = ctl.head; kunmap(page); - SetPageUptodate(page); + if (result != -ERESTARTSYS) + SetPageUptodate(page); unlock_page(page); page_cache_release(page); } -- cgit v1.2.3 From 9cd684551124e71630ab96d238747051463f5b56 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 1 Feb 2006 03:04:40 -0800 Subject: [PATCH] fuse: fix async read for legacy filesystems While asynchronous reads mean a performance improvement in most cases, if the filesystem assumed that reads are synchronous, then async reads may degrade performance (filesystem may receive reads out of order, which can confuse it's own readahead logic). With sshfs a 1.5 to 4 times slowdown can be measured. There's also a need for userspace filesystems to know whether asynchronous reads are supported by the kernel or not. To achive these, negotiate in the INIT request whether async reads will be used and the maximum readahead value. Update interface version to 7.6 If userspace uses a version earlier than 7.6, then disable async reads, and set maximum readahead value to the maximum read size, as done in previous versions. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/file.c | 9 +++++++-- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 14 ++++++++++++-- 3 files changed, 22 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a7ef5e716f3..296351615b0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -335,9 +335,14 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file, loff_t pos = page_offset(req->pages[0]); size_t count = req->num_pages << PAGE_CACHE_SHIFT; req->out.page_zeroing = 1; - req->end = fuse_readpages_end; fuse_read_fill(req, file, inode, pos, count, FUSE_READ); - request_send_background(fc, req); + if (fc->async_read) { + req->end = fuse_readpages_end; + request_send_background(fc, req); + } else { + request_send(fc, req); + fuse_readpages_end(fc, req); + } } struct fuse_readpages_data { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 46cf933aa3b..4a83adfec96 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -272,6 +272,9 @@ struct fuse_conn { reply, before any other request, and never cleared */ unsigned conn_error : 1; + /** Do readpages asynchronously? Only set in INIT */ + unsigned async_read : 1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c755a0440a6..879e6fba948 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -473,6 +473,16 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) fc->conn_error = 1; else { + unsigned long ra_pages; + + if (arg->minor >= 6) { + ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; + if (arg->flags & FUSE_ASYNC_READ) + fc->async_read = 1; + } else + ra_pages = fc->max_read / PAGE_CACHE_SIZE; + + fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; } @@ -496,6 +506,8 @@ static void fuse_send_init(struct fuse_conn *fc) arg->major = FUSE_KERNEL_VERSION; arg->minor = FUSE_KERNEL_MINOR_VERSION; + arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; + arg->flags |= FUSE_ASYNC_READ; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -552,8 +564,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->user_id = d.user_id; fc->group_id = d.group_id; fc->max_read = d.max_read; - if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages) - fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE; /* Used by get_root_inode() */ sb->s_fs_info = fc; -- cgit v1.2.3 From cb82a6cdf994d6656ad0a25ed28395af3416a27c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 1 Feb 2006 03:04:49 -0800 Subject: [PATCH] compat_sys_pselect7() fix fs/compat.c: In function `compat_sys_pselect7': fs/compat.c:1820: warning: passing arg 5 of `compat_core_sys_select' from incompatible pointer type Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index ff0bafcff72..cc58a20df57 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1781,7 +1781,7 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, { compat_sigset_t ss32; sigset_t ksigmask, sigsaved; - long timeout = MAX_SCHEDULE_TIMEOUT; + s64 timeout = MAX_SCHEDULE_TIMEOUT; struct compat_timespec ts; int ret; -- cgit v1.2.3 From 537421be79b94bcf620467f50dd9e38b739c2a00 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 1 Feb 2006 03:05:24 -0800 Subject: [PATCH] Mark CONFIG_UFS_FS_WRITE as BROKEN OpenBSD doesn't see "." correctly in directories created by Linux. Copying files over several KB will buy you infinite loop in __getblk_slow(). Copying files smaller than 1 KB seems to be OK. Sometimes files will be filled with zeros. Sometimes incorrectly copied file will reappear after next file with truncated size. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index ef78e3a42d3..93b5dc4082f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1327,7 +1327,7 @@ config UFS_FS config UFS_FS_WRITE bool "UFS file system write support (DANGEROUS)" - depends on UFS_FS && EXPERIMENTAL + depends on UFS_FS && EXPERIMENTAL && BROKEN help Say Y here if you want to try writing to UFS partitions. This is experimental, so you should back up your UFS partitions beforehand. -- cgit v1.2.3 From 4e6a510a74145585f4111d60d1b5fd450d795dd8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 1 Feb 2006 03:05:31 -0800 Subject: [PATCH] mm: hugepage accounting fix 2.6.15's hugepage faulting introduced huge_pages_needed accounting into hugetlbfs: to count how many pages are already in cache, for spot check on how far a new mapping may be allowed to extend the file. But it's muddled: each hugepage found covers HPAGE_SIZE, not PAGE_SIZE. Once pages were already in cache, it would overshoot, wrap its hugepages count backwards, and so fail a harmless repeat mapping with -ENOMEM. Fixes the problem found by Don Dupuis. Signed-off-by: Hugh Dickins Acked-By: Adam Litke Acked-by: William Irwin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f568102da1e..b3519528994 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -72,8 +72,8 @@ huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma) unsigned long start = vma->vm_start; unsigned long end = vma->vm_end; unsigned long hugepages = (end - start) >> HPAGE_SHIFT; - pgoff_t next = vma->vm_pgoff; - pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT); + pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); + pgoff_t endpg = next + hugepages; pagevec_init(&pvec, 0); while (next < endpg) { -- cgit v1.2.3 From e965f9630c651fa4249039fd4b80c9392d07a856 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 1 Feb 2006 03:05:41 -0800 Subject: [PATCH] Direct Migration V9: Avoid writeback / page_migrate() method Migrate a page with buffers without requiring writeback This introduces a new address space operation migratepage() that may be used by a filesystem to implement its own version of page migration. A version is provided that migrates buffers attached to pages. Some filesystems (ext2, ext3, xfs) are modified to utilize this feature. The swapper address space operation are modified so that a regular migrate_page() will occur for anonymous pages without writeback (migrate_pages forces every anonymous page to have a swap entry). Signed-off-by: Mike Kravetz Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 60 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext2/inode.c | 2 ++ fs/ext3/inode.c | 2 ++ fs/xfs/linux-2.6/xfs_aops.c | 1 + fs/xfs/linux-2.6/xfs_buf.c | 1 + 5 files changed, 66 insertions(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 3dc712f29d2..8bcbac87a28 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3049,6 +3049,66 @@ asmlinkage long sys_bdflush(int func, long data) return 0; } +/* + * Migration function for pages with buffers. This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. + */ +#ifdef CONFIG_MIGRATION +int buffer_migrate_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct buffer_head *bh, *head; + + if (!mapping) + return -EAGAIN; + + if (!page_has_buffers(page)) + return migrate_page(newpage, page); + + head = page_buffers(page); + + if (migrate_page_remove_references(newpage, page, 3)) + return -EAGAIN; + + bh = head; + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + get_page(newpage); + + bh = head; + do { + set_bh_page(bh, newpage, bh_offset(bh)); + bh = bh->b_this_page; + + } while (bh != head); + + SetPagePrivate(newpage); + + migrate_page_copy(newpage, page); + + bh = head; + do { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return 0; +} +EXPORT_SYMBOL(buffer_migrate_page); +#endif + /* * Buffer-head allocation */ diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e7d3f0522d0..a717837f272 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -706,6 +706,7 @@ struct address_space_operations ext2_aops = { .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, + .migratepage = buffer_migrate_page, }; struct address_space_operations ext2_aops_xip = { @@ -723,6 +724,7 @@ struct address_space_operations ext2_nobh_aops = { .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, + .migratepage = buffer_migrate_page, }; /* diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 8824e84f8a5..3fc4238e970 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1559,6 +1559,7 @@ static struct address_space_operations ext3_ordered_aops = { .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, }; static struct address_space_operations ext3_writeback_aops = { @@ -1572,6 +1573,7 @@ static struct address_space_operations ext3_writeback_aops = { .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, }; static struct address_space_operations ext3_journalled_aops = { diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 12062678940..9892268e300 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1462,4 +1462,5 @@ struct address_space_operations linvfs_aops = { .commit_write = generic_commit_write, .bmap = linvfs_bmap, .direct_IO = linvfs_direct_IO, + .migratepage = buffer_migrate_page, }; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index a36a8e3b703..bfb4f2917bb 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1521,6 +1521,7 @@ xfs_mapping_buftarg( struct address_space *mapping; static struct address_space_operations mapping_aops = { .sync_page = block_sync_page, + .migratepage = fail_migrate_page, }; inode = new_inode(bdev->bd_inode->i_sb); -- cgit v1.2.3 From d739b42b82773206297db1fc0d96ef895a5d9688 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 1 Feb 2006 03:06:43 -0800 Subject: [PATCH] reiserfs: remove kmalloc wrapper Remove kmalloc() wrapper from fs/reiserfs/. Please note that a reiserfs /proc entry format is changed because kmalloc statistics is removed. Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/dir.c | 16 +++------- fs/reiserfs/fix_node.c | 50 +++--------------------------- fs/reiserfs/journal.c | 84 ++++++++++++++++---------------------------------- fs/reiserfs/namei.c | 16 +++++----- fs/reiserfs/procfs.c | 3 +- fs/reiserfs/super.c | 6 ---- fs/reiserfs/xattr.c | 11 +++---- 7 files changed, 49 insertions(+), 137 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 9dd71e80703..d71ac657928 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -150,18 +150,15 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (d_reclen <= 32) { local_buf = small_buf; } else { - local_buf = - reiserfs_kmalloc(d_reclen, GFP_NOFS, - inode->i_sb); + local_buf = kmalloc(d_reclen, + GFP_NOFS); if (!local_buf) { pathrelse(&path_to_entry); ret = -ENOMEM; goto out; } if (item_moved(&tmp_ih, &path_to_entry)) { - reiserfs_kfree(local_buf, - d_reclen, - inode->i_sb); + kfree(local_buf); goto research; } } @@ -174,15 +171,12 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir) (dirent, local_buf, d_reclen, d_off, d_ino, DT_UNKNOWN) < 0) { if (local_buf != small_buf) { - reiserfs_kfree(local_buf, - d_reclen, - inode->i_sb); + kfree(local_buf); } goto end; } if (local_buf != small_buf) { - reiserfs_kfree(local_buf, d_reclen, - inode->i_sb); + kfree(local_buf); } // next entry should be looked for with such offset next_pos = deh_offset(deh) + 1; diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 45829889dcd..aa22588019e 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -2021,38 +2021,6 @@ static int get_neighbors(struct tree_balance *p_s_tb, int n_h) return CARRY_ON; } -#ifdef CONFIG_REISERFS_CHECK -void *reiserfs_kmalloc(size_t size, gfp_t flags, struct super_block *s) -{ - void *vp; - static size_t malloced; - - vp = kmalloc(size, flags); - if (vp) { - REISERFS_SB(s)->s_kmallocs += size; - if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) { - reiserfs_warning(s, - "vs-8301: reiserfs_kmalloc: allocated memory %d", - REISERFS_SB(s)->s_kmallocs); - malloced = REISERFS_SB(s)->s_kmallocs; - } - } - return vp; -} - -void reiserfs_kfree(const void *vp, size_t size, struct super_block *s) -{ - kfree(vp); - - REISERFS_SB(s)->s_kmallocs -= size; - if (REISERFS_SB(s)->s_kmallocs < 0) - reiserfs_warning(s, - "vs-8302: reiserfs_kfree: allocated memory %d", - REISERFS_SB(s)->s_kmallocs); - -} -#endif - static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh) { int max_num_of_items; @@ -2086,7 +2054,7 @@ static int get_mem_for_virtual_node(struct tree_balance *tb) /* we have to allocate more memory for virtual node */ if (tb->vn_buf) { /* free memory allocated before */ - reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb); + kfree(tb->vn_buf); /* this is not needed if kfree is atomic */ check_fs = 1; } @@ -2095,24 +2063,15 @@ static int get_mem_for_virtual_node(struct tree_balance *tb) tb->vn_buf_size = size; /* get memory for virtual item */ - buf = - reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN, - tb->tb_sb); + buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); if (!buf) { /* getting memory with GFP_KERNEL priority may involve balancing now (due to indirect_to_direct conversion on dcache shrinking). So, release path and collected resources here */ free_buffers_in_tb(tb); - buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb); + buf = kmalloc(size, GFP_NOFS); if (!buf) { -#ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(tb->tb_sb, - "vs-8345: get_mem_for_virtual_node: " - "kmalloc failed. reiserfs kmalloced %d bytes", - REISERFS_SB(tb->tb_sb)-> - s_kmallocs); -#endif tb->vn_buf_size = 0; } tb->vn_buf = buf; @@ -2619,7 +2578,6 @@ void unfix_nodes(struct tree_balance *tb) } } - if (tb->vn_buf) - reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb); + kfree(tb->vn_buf); } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4491fcf2a0e..16b526fd20b 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -152,18 +152,16 @@ static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block struct reiserfs_bitmap_node *bn; static int id; - bn = reiserfs_kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS, - p_s_sb); + bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS); if (!bn) { return NULL; } - bn->data = reiserfs_kmalloc(p_s_sb->s_blocksize, GFP_NOFS, p_s_sb); + bn->data = kzalloc(p_s_sb->s_blocksize, GFP_NOFS); if (!bn->data) { - reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); + kfree(bn); return NULL; } bn->id = id++; - memset(bn->data, 0, p_s_sb->s_blocksize); INIT_LIST_HEAD(&bn->list); return bn; } @@ -197,8 +195,8 @@ static inline void free_bitmap_node(struct super_block *p_s_sb, struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); journal->j_used_bitmap_nodes--; if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { - reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb); - reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); + kfree(bn->data); + kfree(bn); } else { list_add(&bn->list, &journal->j_bitmap_nodes); journal->j_free_bitmap_nodes++; @@ -276,8 +274,8 @@ static int free_bitmap_nodes(struct super_block *p_s_sb) while (next != &journal->j_bitmap_nodes) { bn = list_entry(next, struct reiserfs_bitmap_node, list); list_del(next); - reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb); - reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); + kfree(bn->data); + kfree(bn); next = journal->j_bitmap_nodes.next; journal->j_free_bitmap_nodes--; } @@ -581,7 +579,7 @@ static inline void put_journal_list(struct super_block *s, jl->j_trans_id, jl->j_refcount); } if (--jl->j_refcount == 0) - reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s); + kfree(jl); } /* @@ -1818,8 +1816,7 @@ void remove_journal_hash(struct super_block *sb, static void free_journal_ram(struct super_block *p_s_sb) { struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); - reiserfs_kfree(journal->j_current_jl, - sizeof(struct reiserfs_journal_list), p_s_sb); + kfree(journal->j_current_jl); journal->j_num_lists--; vfree(journal->j_cnode_free_orig); @@ -2093,21 +2090,15 @@ static int journal_read_transaction(struct super_block *p_s_sb, } trans_id = get_desc_trans_id(desc); /* now we know we've got a good transaction, and it was inside the valid time ranges */ - log_blocks = - reiserfs_kmalloc(get_desc_trans_len(desc) * - sizeof(struct buffer_head *), GFP_NOFS, p_s_sb); - real_blocks = - reiserfs_kmalloc(get_desc_trans_len(desc) * - sizeof(struct buffer_head *), GFP_NOFS, p_s_sb); + log_blocks = kmalloc(get_desc_trans_len(desc) * + sizeof(struct buffer_head *), GFP_NOFS); + real_blocks = kmalloc(get_desc_trans_len(desc) * + sizeof(struct buffer_head *), GFP_NOFS); if (!log_blocks || !real_blocks) { brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); - reiserfs_kfree(real_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); + kfree(log_blocks); + kfree(real_blocks); reiserfs_warning(p_s_sb, "journal-1169: kmalloc failed, unable to mount FS"); return -1; @@ -2145,12 +2136,8 @@ static int journal_read_transaction(struct super_block *p_s_sb, brelse_array(real_blocks, i); brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); - reiserfs_kfree(real_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); + kfree(log_blocks); + kfree(real_blocks); return -1; } } @@ -2166,12 +2153,8 @@ static int journal_read_transaction(struct super_block *p_s_sb, brelse_array(real_blocks, get_desc_trans_len(desc)); brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); - reiserfs_kfree(real_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); + kfree(log_blocks); + kfree(real_blocks); return -1; } memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, @@ -2193,12 +2176,8 @@ static int journal_read_transaction(struct super_block *p_s_sb, get_desc_trans_len(desc) - i); brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); - reiserfs_kfree(real_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); + kfree(log_blocks); + kfree(real_blocks); return -1; } brelse(real_blocks[i]); @@ -2217,12 +2196,8 @@ static int journal_read_transaction(struct super_block *p_s_sb, journal->j_trans_id = trans_id + 1; brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), - p_s_sb); - reiserfs_kfree(real_blocks, - le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), - p_s_sb); + kfree(log_blocks); + kfree(real_blocks); return 0; } @@ -2472,13 +2447,11 @@ static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) { struct reiserfs_journal_list *jl; retry: - jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, - s); + jl = kzalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS); if (!jl) { yield(); goto retry; } - memset(jl, 0, sizeof(*jl)); INIT_LIST_HEAD(&jl->j_list); INIT_LIST_HEAD(&jl->j_working_list); INIT_LIST_HEAD(&jl->j_tail_bh_list); @@ -3042,14 +3015,12 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct } return th; } - th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), - GFP_NOFS, s); + th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS); if (!th) return NULL; ret = journal_begin(th, s, nblocks); if (ret) { - reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), - s); + kfree(th); return NULL; } @@ -3067,8 +3038,7 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) ret = -EIO; if (th->t_refcount == 0) { SB_JOURNAL(s)->j_persistent_trans--; - reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), - s); + kfree(th); } return ret; } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 8f8d8d01107..c8123308e06 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -456,7 +456,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, /* get memory for composing the entry */ buflen = DEH_SIZE + ROUND_UP(namelen); if (buflen > sizeof(small_buf)) { - buffer = reiserfs_kmalloc(buflen, GFP_NOFS, dir->i_sb); + buffer = kmalloc(buflen, GFP_NOFS); if (buffer == 0) return -ENOMEM; } else @@ -490,7 +490,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, retval = reiserfs_find_entry(dir, name, namelen, &path, &de); if (retval != NAME_NOT_FOUND) { if (buffer != small_buf) - reiserfs_kfree(buffer, buflen, dir->i_sb); + kfree(buffer); pathrelse(&path); if (retval == IO_ERROR) { @@ -515,7 +515,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, reiserfs_warning(dir->i_sb, "reiserfs_add_entry: Congratulations! we have got hash function screwed up"); if (buffer != small_buf) - reiserfs_kfree(buffer, buflen, dir->i_sb); + kfree(buffer); pathrelse(&path); return -EBUSY; } @@ -535,7 +535,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, &entry_key); if (buffer != small_buf) - reiserfs_kfree(buffer, buflen, dir->i_sb); + kfree(buffer); pathrelse(&path); return -EBUSY; } @@ -546,7 +546,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer, paste_size); if (buffer != small_buf) - reiserfs_kfree(buffer, buflen, dir->i_sb); + kfree(buffer); if (retval) { reiserfs_check_path(&path); return retval; @@ -1065,7 +1065,7 @@ static int reiserfs_symlink(struct inode *parent_dir, goto out_failed; } - name = reiserfs_kmalloc(item_len, GFP_NOFS, parent_dir->i_sb); + name = kmalloc(item_len, GFP_NOFS); if (!name) { drop_new_inode(inode); retval = -ENOMEM; @@ -1079,14 +1079,14 @@ static int reiserfs_symlink(struct inode *parent_dir, retval = journal_begin(&th, parent_dir->i_sb, jbegin_count); if (retval) { drop_new_inode(inode); - reiserfs_kfree(name, item_len, parent_dir->i_sb); + kfree(name); goto out_failed; } retval = reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname), dentry, inode); - reiserfs_kfree(name, item_len, parent_dir->i_sb); + kfree(name); if (retval) { /* reiserfs_new_inode iputs for us */ goto out_failed; } diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index fc2f43c75df..ef6caed9336 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -88,7 +88,6 @@ static int show_super(struct seq_file *m, struct super_block *sb) seq_printf(m, "state: \t%s\n" "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" "gen. counter: \t%i\n" - "s_kmallocs: \t%i\n" "s_disk_reads: \t%i\n" "s_disk_writes: \t%i\n" "s_fix_nodes: \t%i\n" @@ -128,7 +127,7 @@ static int show_super(struct seq_file *m, struct super_block *sb) "SMALL_TAILS " : "NO_TAILS ", replay_only(sb) ? "REPLAY_ONLY " : "", convert_reiserfs(sb) ? "CONV " : "", - atomic_read(&r->s_generation_counter), SF(s_kmallocs), + atomic_read(&r->s_generation_counter), SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes), SF(s_do_balance), SF(s_unneeded_left_neighbor), SF(s_good_search_by_key_reada), SF(s_bmaps), diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 397d9590c8f..77891de0e02 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -472,12 +472,6 @@ static void reiserfs_put_super(struct super_block *s) print_statistics(s); - if (REISERFS_SB(s)->s_kmallocs != 0) { - reiserfs_warning(s, - "vs-2004: reiserfs_put_super: allocated memory left %d", - REISERFS_SB(s)->s_kmallocs); - } - if (REISERFS_SB(s)->reserved_blocks != 0) { reiserfs_warning(s, "green-2005: reiserfs_put_super: reserved blocks left %d", diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index cc061bfd437..4f0db4e5451 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -368,15 +368,13 @@ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir) if (d_reclen <= 32) { local_buf = small_buf; } else { - local_buf = - reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb); + local_buf = kmalloc(d_reclen, GFP_NOFS); if (!local_buf) { pathrelse(&path_to_entry); return -ENOMEM; } if (item_moved(&tmp_ih, &path_to_entry)) { - reiserfs_kfree(local_buf, d_reclen, - inode->i_sb); + kfree(local_buf); /* sigh, must retry. Do this same offset again */ next_pos = d_off; @@ -399,13 +397,12 @@ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir) if (filldir(dirent, local_buf, d_reclen, d_off, d_ino, DT_UNKNOWN) < 0) { if (local_buf != small_buf) { - reiserfs_kfree(local_buf, d_reclen, - inode->i_sb); + kfree(local_buf); } goto end; } if (local_buf != small_buf) { - reiserfs_kfree(local_buf, d_reclen, inode->i_sb); + kfree(local_buf); } } /* while */ -- cgit v1.2.3 From 8c777cc4be1390862d053cbc002246e87572147b Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Wed, 1 Feb 2006 03:06:43 -0800 Subject: [PATCH] reiserfs: use __GFP_NOFAIL instead of yield and retry loop for allocation This patch replaces yield and retry loop with __GFP_NOFAIL in alloc_journal_list(). Signed-off-by: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 16b526fd20b..2d04efb22ee 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2446,12 +2446,8 @@ static int journal_read(struct super_block *p_s_sb) static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) { struct reiserfs_journal_list *jl; - retry: - jl = kzalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS); - if (!jl) { - yield(); - goto retry; - } + jl = kzalloc(sizeof(struct reiserfs_journal_list), + GFP_NOFS | __GFP_NOFAIL); INIT_LIST_HEAD(&jl->j_list); INIT_LIST_HEAD(&jl->j_working_list); INIT_LIST_HEAD(&jl->j_tail_bh_list); -- cgit v1.2.3 From e5dd259f78ba0fd0c7bfc5c52179dbbff3eb48aa Mon Sep 17 00:00:00 2001 From: Diego Calleja Date: Wed, 1 Feb 2006 03:06:44 -0800 Subject: [PATCH] reiserfs: missing kmalloc failure check According to http://bugzilla.kernel.org/show_bug.cgi?id=5778 fs/reiserfs/file.c is missing this check. Signed-off-by: Diego Calleja Cc: Jeff Mahoney Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index ad6fa964b0e..1cad5d066a5 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -192,6 +192,8 @@ static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handl allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) * sizeof(b_blocknr_t), GFP_NOFS); + if (!allocated_blocks) + return -ENOMEM; /* First we compose a key to point at the writing position, we want to do that outside of any locking region. */ -- cgit v1.2.3 From c87d0c07ea198db1ce451421904edd60b7d385ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Feb 2006 03:06:45 -0800 Subject: [PATCH] reiserfs: remove reiserfs_permission_locked This function is completely unused since the xattr permission checking changes. Remove it and fold __reiserfs_permission into reiserfs_permission. Signed-off-by: Christoph Hellwig Cc: Jeff Mahoney Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4f0db4e5451..2f085845f67 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -1319,9 +1319,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) return err; } -static int -__reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd, - int need_lock) +int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) { umode_t mode = inode->i_mode; @@ -1357,15 +1355,14 @@ __reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd, if (!(mode & S_IRWXG)) goto check_groups; - if (need_lock) { - reiserfs_read_lock_xattr_i(inode); - reiserfs_read_lock_xattrs(inode->i_sb); - } + reiserfs_read_lock_xattr_i(inode); + reiserfs_read_lock_xattrs(inode->i_sb); + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - if (need_lock) { - reiserfs_read_unlock_xattrs(inode->i_sb); - reiserfs_read_unlock_xattr_i(inode); - } + + reiserfs_read_unlock_xattrs(inode->i_sb); + reiserfs_read_unlock_xattr_i(inode); + if (IS_ERR(acl)) { if (PTR_ERR(acl) == -ENODATA) goto check_groups; @@ -1414,14 +1411,3 @@ __reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd, return -EACCES; } - -int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - return __reiserfs_permission(inode, mask, nd, 1); -} - -int -reiserfs_permission_locked(struct inode *inode, int mask, struct nameidata *nd) -{ - return __reiserfs_permission(inode, mask, nd, 0); -} -- cgit v1.2.3 From ec191574b9c3cb7bfb95e4f803b63f7c8dc52690 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Feb 2006 03:06:46 -0800 Subject: [PATCH] reiserfs: use generic_permission Use the generic_permission code with a proper wrapper and callback instead of having a local copy. Signed-off-by: Christoph Hellwig Cc: Jeff Mahoney Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr.c | 103 +++++++++++++--------------------------------------- 1 file changed, 26 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 2f085845f67..ffb79c48c5b 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -1319,95 +1319,44 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) return err; } -int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) +static int reiserfs_check_acl(struct inode *inode, int mask) { - umode_t mode = inode->i_mode; - - if (mask & MAY_WRITE) { - /* - * Nobody gets write access to a read-only fs. - */ - if (IS_RDONLY(inode) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) - return -EROFS; - - /* - * Nobody gets write access to an immutable file. - */ - if (IS_IMMUTABLE(inode)) - return -EACCES; - } - - /* We don't do permission checks on the internal objects. - * Permissions are determined by the "owning" object. */ - if (is_reiserfs_priv_object(inode)) - return 0; - - if (current->fsuid == inode->i_uid) { - mode >>= 6; -#ifdef CONFIG_REISERFS_FS_POSIX_ACL - } else if (reiserfs_posixacl(inode->i_sb) && - get_inode_sd_version(inode) != STAT_DATA_V1) { - struct posix_acl *acl; - - /* ACL can't contain additional permissions if - the ACL_MASK entry is 0 */ - if (!(mode & S_IRWXG)) - goto check_groups; + struct posix_acl *acl; + int error = -EAGAIN; /* do regular unix permission checks by default */ - reiserfs_read_lock_xattr_i(inode); - reiserfs_read_lock_xattrs(inode->i_sb); - - acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + reiserfs_read_lock_xattr_i(inode); + reiserfs_read_lock_xattrs(inode->i_sb); - reiserfs_read_unlock_xattrs(inode->i_sb); - reiserfs_read_unlock_xattr_i(inode); + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - if (IS_ERR(acl)) { - if (PTR_ERR(acl) == -ENODATA) - goto check_groups; - return PTR_ERR(acl); - } + reiserfs_read_unlock_xattrs(inode->i_sb); + reiserfs_read_unlock_xattr_i(inode); - if (acl) { - int err = posix_acl_permission(inode, acl, mask); + if (acl) { + if (!IS_ERR(acl)) { + error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); - if (err == -EACCES) { - goto check_capabilities; - } - return err; - } else { - goto check_groups; - } -#endif - } else { - check_groups: - if (in_group_p(inode->i_gid)) - mode >>= 3; + } else if (PTR_ERR(acl) != -ENODATA) + error = PTR_ERR(acl); } - /* - * If the DACs are ok we don't need any capability check. - */ - if (((mode & mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == mask)) - return 0; + return error; +} - check_capabilities: +int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) +{ /* - * Read/write DACs are always overridable. - * Executable DACs are overridable if at least one exec bit is set. + * We don't do permission checks on the internal objects. + * Permissions are determined by the "owning" object. */ - if (!(mask & MAY_EXEC) || - (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)) - if (capable(CAP_DAC_OVERRIDE)) - return 0; + if (is_reiserfs_priv_object(inode)) + return 0; /* - * Searching includes executable on directories, else just read. + * Stat data v1 doesn't support ACLs. */ - if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) - if (capable(CAP_DAC_READ_SEARCH)) - return 0; - - return -EACCES; + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return generic_permission(inode, mask, NULL); + else + return generic_permission(inode, mask, reiserfs_check_acl); } -- cgit v1.2.3 From d62b1b87a7d1c3a21dddabed4251763090be3182 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 1 Feb 2006 03:06:47 -0800 Subject: [PATCH] resierfs: fix reiserfs_invalidatepage race against data=ordered After a transaction has closed but before it has finished commit, there is a window where data=ordered mode requires invalidatepage to pin pages instead of freeing them. This patch fixes a race between the invalidatepage checks and data=ordered writeback, and it also adds a check to the reiserfs write_ordered_buffers routines to write any anonymous buffers that were dirtied after its first writeback loop. That bug works like this: proc1: transaction closes and a new one starts proc1: write_ordered_buffers starts processing data=ordered list proc1: buffer A is cleaned and written proc2: buffer A is dirtied by another process proc2: File is truncated to zero, page A goes through invalidatepage proc2: reiserfs_invalidatepage sees dirty buffer A with reiserfs journal head, pins it proc1: write_ordered_buffers frees the journal head on buffer A At this point, buffer A stays dirty forever Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/inode.c | 4 +++- fs/reiserfs/journal.c | 13 +++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ffa34b861bd..60e2f234470 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2743,6 +2743,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) int ret = 1; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + lock_buffer(bh); spin_lock(&j->j_dirty_buffers_lock); if (!buffer_mapped(bh)) { goto free_jh; @@ -2758,7 +2759,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { ret = 0; } - } else if (buffer_dirty(bh) || buffer_locked(bh)) { + } else if (buffer_dirty(bh)) { struct reiserfs_journal_list *jl; struct reiserfs_jh *jh = bh->b_private; @@ -2784,6 +2785,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) reiserfs_free_jh(bh); } spin_unlock(&j->j_dirty_buffers_lock); + unlock_buffer(bh); return ret; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 2d04efb22ee..bc8fe963b3c 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -877,6 +877,19 @@ static int write_ordered_buffers(spinlock_t * lock, if (!buffer_uptodate(bh)) { ret = -EIO; } + /* ugly interaction with invalidatepage here. + * reiserfs_invalidate_page will pin any buffer that has a valid + * journal head from an older transaction. If someone else sets + * our buffer dirty after we write it in the first loop, and + * then someone truncates the page away, nobody will ever write + * the buffer. We're safe if we write the page one last time + * after freeing the journal header. + */ + if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { + spin_unlock(lock); + ll_rw_block(WRITE, 1, &bh); + spin_lock(lock); + } put_bh(bh); cond_resched_lock(lock); } -- cgit v1.2.3 From fc5cd582e9c934ddaf6f310179488932cd154794 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 1 Feb 2006 03:06:48 -0800 Subject: [PATCH] reiserfs: zero b_private when allocating buffer heads The b_private field in buffer heads needs to be zero filled when the buffers are allocated. Thanks to Nathan Scott for finding this. It was causing problems on systems with both XFS and reiserfs. Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 8bcbac87a28..5e4a90ee103 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1022,6 +1022,7 @@ try_again: bh->b_state = 0; atomic_set(&bh->b_count, 0); + bh->b_private = NULL; bh->b_size = size; /* Link the buffer to its page */ -- cgit v1.2.3 From e0e851cf30f1a9bd2e2a7624e9810378d6a2b072 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 1 Feb 2006 03:06:49 -0800 Subject: [PATCH] reiserfs: reiserfs hang and performance fix for data=journal mode In data=journal mode, reiserfs writepage needs to make sure not to trigger transactions while being run under PF_MEMALLOC. This patch makes sure to redirty the page instead of forcing a transaction start in this case. Also, calling filemap_fdata* in order to trigger io on the block device can cause lock inversions on the page lock. Instead, do simple batching from flush_commit_list. Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/inode.c | 7 +++++++ fs/reiserfs/journal.c | 19 ++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 60e2f234470..b33d67bba2f 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2363,6 +2363,13 @@ static int reiserfs_write_full_page(struct page *page, int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; th.t_trans_id = 0; + /* no logging allowed when nonblocking or from PF_MEMALLOC */ + if (checked && (current->flags & PF_MEMALLOC)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + /* The page dirty bit is cleared before writepage is called, which * means we have to tell create_empty_buffers to make dirty buffers * The page really should be up to date at this point, so tossing diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index bc8fe963b3c..1b2402a9a8e 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -988,6 +988,7 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal *journal = SB_JOURNAL(s); int barrier = 0; int retval = 0; + int write_len; reiserfs_check_lock_depth(s, "flush_commit_list"); @@ -1037,16 +1038,24 @@ static int flush_commit_list(struct super_block *s, BUG_ON(!list_empty(&jl->j_bh_list)); /* * for the description block and all the log blocks, submit any buffers - * that haven't already reached the disk + * that haven't already reached the disk. Try to write at least 256 + * log blocks. later on, we will only wait on blocks that correspond + * to this transaction, but while we're unplugging we might as well + * get a chunk of data on there. */ atomic_inc(&journal->j_async_throttle); - for (i = 0; i < (jl->j_len + 1); i++) { + write_len = jl->j_len + 1; + if (write_len < 256) + write_len = 256; + for (i = 0 ; i < write_len ; i++) { bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); tbh = journal_find_get_block(s, bn); - if (buffer_dirty(tbh)) /* redundant, ll_rw_block() checks */ - ll_rw_block(SWRITE, 1, &tbh); - put_bh(tbh); + if (tbh) { + if (buffer_dirty(tbh)) + ll_rw_block(WRITE, 1, &tbh) ; + put_bh(tbh) ; + } } atomic_dec(&journal->j_async_throttle); -- cgit v1.2.3 From 3d4492f81dd7b486f1be0616a1ce7f73760f406e Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 1 Feb 2006 03:06:49 -0800 Subject: [PATCH] reiserfs: reiserfs write_ordered_buffers should not oops on dirty non-uptodate bh write_ordered_buffers should handle dirty non-uptodate buffers without a BUG() Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 1b2402a9a8e..47c9f432dea 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -846,6 +846,14 @@ static int write_ordered_buffers(spinlock_t * lock, spin_lock(lock); goto loop_next; } + /* in theory, dirty non-uptodate buffers should never get here, + * but the upper layer io error paths still have a few quirks. + * Handle them here as gracefully as we can + */ + if (!buffer_uptodate(bh) && buffer_dirty(bh)) { + clear_buffer_dirty(bh); + ret = -EIO; + } if (buffer_dirty(bh)) { list_del_init(&jh->list); list_add(&jh->list, &tmp); @@ -1030,9 +1038,12 @@ static int flush_commit_list(struct super_block *s, } if (!list_empty(&jl->j_bh_list)) { + int ret; unlock_kernel(); - write_ordered_buffers(&journal->j_dirty_buffers_lock, - journal, jl, &jl->j_bh_list); + ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, + journal, jl, &jl->j_bh_list); + if (ret < 0 && retval == 0) + retval = ret; lock_kernel(); } BUG_ON(!list_empty(&jl->j_bh_list)); -- cgit v1.2.3 From 6ae1ea447d21c4fecf5df8d0e1022461274fb4e8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 1 Feb 2006 03:06:50 -0800 Subject: [PATCH] reiserfs: reiserfs fix journal accounting in journal_transaction_should_end reiserfs: journal_transaction_should_end should increase the count of blocks allocated so the transaction subsystem can keep new writers from creating a transaction that is too large. Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 47c9f432dea..b7a179560ab 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2823,6 +2823,9 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, journal->j_cnode_free < (journal->j_trans_max * 3)) { return 1; } + /* protected by the BKL here */ + journal->j_len_alloc += new_alloc; + th->t_blocks_allocated += new_alloc ; return 0; } -- cgit v1.2.3 From fa385bef256077f3b820b241e8f3755ef3905b74 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 1 Feb 2006 03:06:51 -0800 Subject: [PATCH] reiserfs: reiserfs: check for files > 2GB on 3.5.x disks When a filesystem has been converted from 3.5.x to 3.6.x, we need an extra check during file write to make sure we are not trying to make a 3.5.x file > 2GB. Signed-off-by: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/file.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'fs') diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 1cad5d066a5..f3473176c83 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -1287,6 +1287,23 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t struct reiserfs_transaction_handle th; th.t_trans_id = 0; + /* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items + * lying around (most of the disk, in fact). Despite the filesystem + * now being a v3.6 format, the old items still can't support large + * file sizes. Catch this case here, as the rest of the VFS layer is + * oblivious to the different limitations between old and new items. + * reiserfs_setattr catches this for truncates. This chunk is lifted + * from generic_write_checks. */ + if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 && + *ppos + count > MAX_NON_LFS) { + if (*ppos >= MAX_NON_LFS) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + if (count > MAX_NON_LFS - (unsigned long)*ppos) + count = MAX_NON_LFS - (unsigned long)*ppos; + } + if (file->f_flags & O_DIRECT) { // Direct IO needs treatment ssize_t result, after_file_end = 0; if ((*ppos + count >= inode->i_size) -- cgit v1.2.3 From 7045f37b17ffa6e85435ca980122b46a74caa7e4 Mon Sep 17 00:00:00 2001 From: Martin Waitz Date: Wed, 1 Feb 2006 03:06:57 -0800 Subject: [PATCH] DocBook: fix some kernel-doc comments in fs and block Update some parameter descriptions to actually match the code. Signed-off-by: Martin Waitz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 2 +- fs/namei.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 108138d4e90..d0be6159eb7 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1179,7 +1179,7 @@ EXPORT_SYMBOL(bmap); /** * touch_atime - update the access time * @mnt: mount the inode is accessed on - * @inode: inode accessed + * @dentry: dentry accessed * * Update the accessed time on an inode and mark it for writeback. * This function automatically handles read only file systems and media, diff --git a/fs/namei.c b/fs/namei.c index 4acdac043b6..7ac9fb4acb2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1161,6 +1161,7 @@ static int __path_lookup_intent_open(int dfd, const char *name, /** * path_lookup_open - lookup a file path with open intent + * @dfd: the directory to use as base, or AT_FDCWD * @name: pointer to file name * @lookup_flags: lookup intent flags * @nd: pointer to nameidata @@ -1175,6 +1176,7 @@ int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags, /** * path_lookup_create - lookup a file path with open + create intent + * @dfd: the directory to use as base, or AT_FDCWD * @name: pointer to file name * @lookup_flags: lookup intent flags * @nd: pointer to nameidata -- cgit v1.2.3 From 16fb24252a8170799e7adf14d8fc31b817fcaf53 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Feb 2006 12:18:22 -0500 Subject: NLM: Fix arguments to NLM_CANCEL call The OpenGroup docs state that the arguments "block", "exclusive" and "alock" must exactly match the arguments for the lock call that we are trying to cancel. Currently, "block" is always set to false, which is wrong. See bug# 5956 on bugzilla.kernel.org. Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 14552403957..b8ecfa1168f 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -28,6 +28,7 @@ static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *); static int nlmclnt_unlock(struct nlm_rqst *, struct file_lock *); static int nlm_stat_to_errno(u32 stat); static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host); +static int nlmclnt_cancel(struct nlm_host *, int , struct file_lock *); static const struct rpc_call_ops nlmclnt_unlock_ops; static const struct rpc_call_ops nlmclnt_cancel_ops; @@ -598,7 +599,7 @@ out_unblock: nlmclnt_finish_block(req); /* Cancel the blocked request if it is still pending */ if (resp->status == NLM_LCK_BLOCKED) - nlmclnt_cancel(host, fl); + nlmclnt_cancel(host, req->a_args.block, fl); out: nlmclnt_release_lockargs(req); return status; @@ -728,8 +729,7 @@ static const struct rpc_call_ops nlmclnt_unlock_ops = { * We always use an async RPC call for this in order not to hang a * process that has been Ctrl-C'ed. */ -int -nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl) +static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl) { struct nlm_rqst *req; unsigned long flags; @@ -750,6 +750,7 @@ nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl) req->a_flags = RPC_TASK_ASYNC; nlmclnt_setlockargs(req, fl); + req->a_args.block = block; status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); if (status < 0) { -- cgit v1.2.3 From aaaa99423b4b1f9cfd33ea5643d9274c25f62491 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Feb 2006 12:18:25 -0500 Subject: NLM: Ensure that nlmclnt_cancel_callback() doesn't loop forever If the server returns NLM_LCK_DENIED_NOLOCKS, we currently retry the entire NLM_CANCEL request. This may end up looping forever unless the server changes its mind (why would it do that, though?). Ensure that we limit the number of retries (to 3). See bug# 5957 in bugzilla.kernel.org. Signed-off-by: Trond Myklebust --- fs/lockd/clntproc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index b8ecfa1168f..220058d8616 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -22,6 +22,7 @@ #define NLMDBG_FACILITY NLMDBG_CLIENT #define NLMCLNT_GRACE_WAIT (5*HZ) #define NLMCLNT_POLL_TIMEOUT (30*HZ) +#define NLMCLNT_MAX_RETRIES 3 static int nlmclnt_test(struct nlm_rqst *, struct file_lock *); static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *); @@ -802,6 +803,9 @@ die: return; retry_cancel: + /* Don't ever retry more than 3 times */ + if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) + goto die; nlm_rebind_host(req->a_host); rpc_restart_call(task); rpc_delay(task, 30 * HZ); -- cgit v1.2.3 From 1935245655996ca4d14e687c3a100d2e2bbdc78d Mon Sep 17 00:00:00 2001 From: Dirk Mueller Date: Wed, 1 Feb 2006 12:19:47 -0500 Subject: NFSv3: fix sync_retry in direct i/o NFS Only do a sync_retry if the memcmp failed. Signed-off-by: Dirk Mueller Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 10ae377e68f..04ab2fc360e 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -481,7 +481,7 @@ retry: if (wdata->verf.committed != NFS_FILE_SYNC) { need_commit = 1; if (memcmp(&first_verf.verifier, &wdata->verf.verifier, - sizeof(first_verf.verifier))); + sizeof(first_verf.verifier))) goto sync_retry; } -- cgit v1.2.3 From 9ad11ab48b1ad618bf47076e9e579f267f5306c2 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 2 Feb 2006 16:11:51 +1100 Subject: [PATCH] compat: fix compat_sys_openat and friends Most of the 64 bit architectures will zero extend the first argument to compat_sys_{openat,newfstatat,futimesat} which will fail if the 32 bit syscall was passed AT_FDCWD (which is a small negative number). Declare the first argument to be an unsigned int which will force the correct sign extension when the internal functions are called in each case. Also, do some small white space cleanups in fs/compat.c. Signed-off-by: Stephen Rothwell Acked-by: David S. Miller Signed-off-by: Linus Torvalds --- fs/compat.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index cc58a20df57..70c5af4cc27 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -73,17 +73,17 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __ return do_utimes(AT_FDCWD, filename, t ? tv : NULL); } -asmlinkage long compat_sys_futimesat(int dfd, char __user *filename, struct compat_timeval __user *t) +asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t) { struct timeval tv[2]; - if (t) { + if (t) { if (get_user(tv[0].tv_sec, &t[0].tv_sec) || get_user(tv[0].tv_usec, &t[0].tv_usec) || get_user(tv[1].tv_sec, &t[1].tv_sec) || get_user(tv[1].tv_usec, &t[1].tv_usec)) - return -EFAULT; - } + return -EFAULT; + } return do_utimes(dfd, filename, t ? tv : NULL); } @@ -114,7 +114,7 @@ asmlinkage long compat_sys_newlstat(char __user * filename, return error; } -asmlinkage long compat_sys_newfstatat(int dfd, char __user *filename, +asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename, struct compat_stat __user *statbuf, int flag) { struct kstat stat; @@ -1326,7 +1326,7 @@ compat_sys_open(const char __user *filename, int flags, int mode) * O_LARGEFILE flag. */ asmlinkage long -compat_sys_openat(int dfd, const char __user *filename, int flags, int mode) +compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode) { return do_sys_open(dfd, filename, flags, mode); } -- cgit v1.2.3 From d35c602870ece3166cff3d25fbc687a7f707acf3 Mon Sep 17 00:00:00 2001 From: Vitaly Fertman Date: Fri, 3 Feb 2006 03:04:01 -0800 Subject: [PATCH] someone broke reiserfs V3 mount options, this fixes it Signed-off-by: Hans Reiser Signed-off-by: Vitaly Fertman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 77891de0e02..ef5e5414e7a 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1125,7 +1125,7 @@ static void handle_attrs(struct super_block *s) REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); } } else if (le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared) { - REISERFS_SB(s)->s_mount_opt |= REISERFS_ATTRS; + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ATTRS); } } -- cgit v1.2.3 From 8c17e1eb05977283bc7ad94d16ace3a0d586921a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 3 Feb 2006 03:04:03 -0800 Subject: [PATCH] quota_v2: printk warning fixes fs/quota_v2.c: In function `v2_check_quota_file': fs/quota_v2.c:39: warning: int format, different type arg (arg 2) fs/quota_v2.c:39: warning: int format, different type arg (arg 3) Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/quota_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/quota_v2.c b/fs/quota_v2.c index a4ef91bb4f3..b4199ec3ece 100644 --- a/fs/quota_v2.c +++ b/fs/quota_v2.c @@ -35,7 +35,7 @@ static int v2_check_quota_file(struct super_block *sb, int type) size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0); if (size != sizeof(struct v2_disk_dqheader)) { - printk("quota_v2: failed read expected=%d got=%d\n", + printk("quota_v2: failed read expected=%zd got=%zd\n", sizeof(struct v2_disk_dqheader), size); return 0; } -- cgit v1.2.3 From e295cfcb2907ae4c5df57f5d4ada1ce6f3ae4657 Mon Sep 17 00:00:00 2001 From: Evgeniy Dushistov Date: Fri, 3 Feb 2006 03:04:04 -0800 Subject: [PATCH] ufs: fix oops with `ufs1' type "rm" command, on file system with "ufs1" type cause system hang up. This is, in fact, not so bad as it seems to be, because of after that in "kernel control path" there are 3-4 places which may cause "oops". So the first patch fix oopses, and the second patch fix "kernel hang up". "oops" appears because of reading of group's summary info partly wrong, and access to not first group's summary info cause "oops". Signed-off-by: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/super.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ufs/super.c b/fs/ufs/super.c index d4aacee593f..e9055ef7f5a 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -388,7 +388,8 @@ static int ufs_parse_options (char * options, unsigned * mount_options) /* * Read on-disk structures associated with cylinder groups */ -static int ufs_read_cylinder_structures (struct super_block *sb) { +static int ufs_read_cylinder_structures (struct super_block *sb) +{ struct ufs_sb_info * sbi = UFS_SB(sb); struct ufs_sb_private_info * uspi; struct ufs_super_block *usb; @@ -415,6 +416,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb) { base = space = kmalloc(size, GFP_KERNEL); if (!base) goto failed; + sbi->s_csp = (struct ufs_csum *)space; for (i = 0; i < blks; i += uspi->s_fpb) { size = uspi->s_bsize; if (i + uspi->s_fpb > blks) @@ -430,7 +432,6 @@ static int ufs_read_cylinder_structures (struct super_block *sb) { goto failed; ubh_ubhcpymem (space, ubh, size); - sbi->s_csp[ufs_fragstoblks(i)]=(struct ufs_csum *)space; space += size; ubh_brelse (ubh); @@ -486,7 +487,8 @@ failed: * Put on-disk structures associated with cylinder groups and * write them back to disk */ -static void ufs_put_cylinder_structures (struct super_block *sb) { +static void ufs_put_cylinder_structures (struct super_block *sb) +{ struct ufs_sb_info * sbi = UFS_SB(sb); struct ufs_sb_private_info * uspi; struct ufs_buffer_head * ubh; @@ -499,7 +501,7 @@ static void ufs_put_cylinder_structures (struct super_block *sb) { size = uspi->s_cssize; blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; - base = space = (char*) sbi->s_csp[0]; + base = space = (char*) sbi->s_csp; for (i = 0; i < blks; i += uspi->s_fpb) { size = uspi->s_bsize; if (i + uspi->s_fpb > blks) -- cgit v1.2.3 From 09114eb8c53d2d3b2ff9523e011cb68b2e245dce Mon Sep 17 00:00:00 2001 From: Evgeniy Dushistov Date: Fri, 3 Feb 2006 03:04:06 -0800 Subject: [PATCH] ufs: fix hang during `rm' This fixes the code like this: bh = sb_find_get_block (sb, tmp + j); if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { retry = 1; brelse (bh); goto next1; } bforget (bh); sb_find_get_block() ordinarily returns a buffer_head with b_count>=2, and this code assume that in case if "b_count>1" buffer is used, so this caused infinite loop. (akpm: that is-the-buffer-busy code is incomprehensible. Good riddance. Use of block_truncate_page() seems sane). Signed-off-by: Evgeniy Dushistov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ufs/inode.c | 2 +- fs/ufs/truncate.c | 72 +++++++++++++------------------------------------------ 2 files changed, 17 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e0c04e36a05..3c3f62ce2ad 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -376,7 +376,7 @@ out: * This function gets the block which contains the fragment. */ -static int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) +int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) { struct super_block * sb = inode->i_sb; struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 61d2e35012a..02e86291ef8 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -29,6 +29,11 @@ * Idea from Pierre del Perugia */ +/* + * Modified to avoid infinite loop on 2006 by + * Evgeniy Dushistov + */ + #include #include #include @@ -65,19 +70,16 @@ #define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) #define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) -#define DATA_BUFFER_USED(bh) \ - (atomic_read(&bh->b_count)>1 || buffer_locked(bh)) static int ufs_trunc_direct (struct inode * inode) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block * sb; struct ufs_sb_private_info * uspi; - struct buffer_head * bh; __fs32 * p; unsigned frag1, frag2, frag3, frag4, block1, block2; unsigned frag_to_free, free_count; - unsigned i, j, tmp; + unsigned i, tmp; int retry; UFSD(("ENTER\n")) @@ -117,15 +119,7 @@ static int ufs_trunc_direct (struct inode * inode) ufs_panic (sb, "ufs_trunc_direct", "internal error"); frag1 = ufs_fragnum (frag1); frag2 = ufs_fragnum (frag2); - for (j = frag1; j < frag2; j++) { - bh = sb_find_get_block (sb, tmp + j); - if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { - retry = 1; - brelse (bh); - goto next1; - } - bforget (bh); - } + inode->i_blocks -= (frag2-frag1) << uspi->s_nspfshift; mark_inode_dirty(inode); ufs_free_fragments (inode, tmp + frag1, frag2 - frag1); @@ -140,15 +134,7 @@ next1: tmp = fs32_to_cpu(sb, *p); if (!tmp) continue; - for (j = 0; j < uspi->s_fpb; j++) { - bh = sb_find_get_block(sb, tmp + j); - if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { - retry = 1; - brelse (bh); - goto next2; - } - bforget (bh); - } + *p = 0; inode->i_blocks -= uspi->s_nspb; mark_inode_dirty(inode); @@ -162,7 +148,6 @@ next1: frag_to_free = tmp; free_count = uspi->s_fpb; } -next2:; } if (free_count > 0) @@ -179,15 +164,7 @@ next2:; if (!tmp ) ufs_panic(sb, "ufs_truncate_direct", "internal error"); frag4 = ufs_fragnum (frag4); - for (j = 0; j < frag4; j++) { - bh = sb_find_get_block (sb, tmp + j); - if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { - retry = 1; - brelse (bh); - goto next1; - } - bforget (bh); - } + *p = 0; inode->i_blocks -= frag4 << uspi->s_nspfshift; mark_inode_dirty(inode); @@ -204,9 +181,8 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_buffer_head * ind_ubh; - struct buffer_head * bh; __fs32 * ind; - unsigned indirect_block, i, j, tmp; + unsigned indirect_block, i, tmp; unsigned frag_to_free, free_count; int retry; @@ -238,15 +214,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) tmp = fs32_to_cpu(sb, *ind); if (!tmp) continue; - for (j = 0; j < uspi->s_fpb; j++) { - bh = sb_find_get_block(sb, tmp + j); - if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *ind)) { - retry = 1; - brelse (bh); - goto next; - } - bforget (bh); - } + *ind = 0; ubh_mark_buffer_dirty(ind_ubh); if (free_count == 0) { @@ -261,7 +229,6 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) } inode->i_blocks -= uspi->s_nspb; mark_inode_dirty(inode); -next:; } if (free_count > 0) { @@ -430,9 +397,7 @@ void ufs_truncate (struct inode * inode) struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block * sb; struct ufs_sb_private_info * uspi; - struct buffer_head * bh; - unsigned offset; - int err, retry; + int retry; UFSD(("ENTER\n")) sb = inode->i_sb; @@ -442,6 +407,9 @@ void ufs_truncate (struct inode * inode) return; if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; + + block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); + lock_kernel(); while (1) { retry = ufs_trunc_direct(inode); @@ -457,15 +425,7 @@ void ufs_truncate (struct inode * inode) blk_run_address_space(inode->i_mapping); yield(); } - offset = inode->i_size & uspi->s_fshift; - if (offset) { - bh = ufs_bread (inode, inode->i_size >> uspi->s_fshift, 0, &err); - if (bh) { - memset (bh->b_data + offset, 0, uspi->s_fsize - offset); - mark_buffer_dirty (bh); - brelse (bh); - } - } + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; ufsi->i_lastfrag = DIRECT_FRAGMENT; unlock_kernel(); -- cgit v1.2.3 From 47ba87e0b1269698801310bfd1716b0538282405 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 3 Feb 2006 03:04:06 -0800 Subject: [PATCH] make "struct d_cookie" depend on CONFIG_PROFILING Shrinks "struct dentry" from 128 bytes to 124 on x86, allowing 31 objects per slab instead of 30. Cc: John Levon Cc: Philippe Elie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 86bdb93789c..a173bba3266 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -743,7 +743,9 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_op = NULL; dentry->d_fsdata = NULL; dentry->d_mounted = 0; +#ifdef CONFIG_PROFILING dentry->d_cookie = NULL; +#endif INIT_HLIST_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); -- cgit v1.2.3 From dfa08592ca0440d793ecc8dfc6277dd2aa4b8dda Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Fri, 3 Feb 2006 03:04:13 -0800 Subject: [PATCH] Fix two ext[23] uninitialized warnings There is a code path that passed size to ext2_xattr_set (ext3_xattr_set_handle) before initializing it. The callees don't use the value in that case, but gcc cannot tell. Always initialize size to get rid of the warnings. Signed-off-by: Andreas Gruenbacher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/acl.c | 2 +- fs/ext3/acl.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 35acc43b897..da52b4a5db6 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -220,7 +220,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) struct ext2_inode_info *ei = EXT2_I(inode); int name_index; void *value = NULL; - size_t size; + size_t size = 0; int error; if (S_ISLNK(inode->i_mode)) diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 47a9da2dfb4..0d21d558b87 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -226,7 +226,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, struct ext3_inode_info *ei = EXT3_I(inode); int name_index; void *value = NULL; - size_t size; + size_t size = 0; int error; if (S_ISLNK(inode->i_mode)) -- cgit v1.2.3 From bd3bfeb58aeddb660dc600ded2fa9243e0c2d12b Mon Sep 17 00:00:00 2001 From: Felix Oxley Date: Fri, 3 Feb 2006 03:04:15 -0800 Subject: [PATCH] fs/jffs/intrep.c: 255 is unsigned char Signed-off-by: Felix Oxley Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jffs/intrep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c index b2e95421d93..ce7b54b0b2b 100644 --- a/fs/jffs/intrep.c +++ b/fs/jffs/intrep.c @@ -1965,7 +1965,7 @@ retry: iovec_cnt++; if (JFFS_GET_PAD_BYTES(raw_inode->nsize)) { - static char allff[3]={255,255,255}; + static unsigned char allff[3]={255,255,255}; /* Add some extra padding if necessary */ node_iovec[iovec_cnt].iov_base = allff; node_iovec[iovec_cnt].iov_len = -- cgit v1.2.3 From 93c615feffbcea4f09ecee154f46062f6041776e Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Fri, 3 Feb 2006 03:04:17 -0800 Subject: [PATCH] v9fs: symlink support fixes Two symlink fixes, v9fs_readlink didn't copy the last character of the symlink name, v9fs_vfs_follow_link incorrectly called strlen of newly allocated buffer instead of PATH_MAX. Signed-off-by: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 91f552454c7..63e5b0398e8 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -886,8 +886,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) } /* copy extension buffer into buffer */ - if (fcall->params.rstat.stat.extension.len < buflen) - buflen = fcall->params.rstat.stat.extension.len; + if (fcall->params.rstat.stat.extension.len+1 < buflen) + buflen = fcall->params.rstat.stat.extension.len + 1; memcpy(buffer, fcall->params.rstat.stat.extension.str, buflen - 1); buffer[buflen-1] = 0; @@ -951,7 +951,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) if (!link) link = ERR_PTR(-ENOMEM); else { - len = v9fs_readlink(dentry, link, strlen(link)); + len = v9fs_readlink(dentry, link, PATH_MAX); if (len < 0) { __putname(link); -- cgit v1.2.3 From 05818a004a84951fd383694f3b35d89eb49fa308 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Fri, 3 Feb 2006 03:04:18 -0800 Subject: [PATCH] v9fs: v9fs_put_str fix v9fs_put_str used to store pointer to the source string, instead of the cbuf copy. This patch corrects it. Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/conv.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/9p/conv.c b/fs/9p/conv.c index 32a9f99154e..bf1f1006796 100644 --- a/fs/9p/conv.c +++ b/fs/9p/conv.c @@ -116,13 +116,19 @@ static void buf_put_int64(struct cbuf *buf, u64 val) } } -static void buf_put_stringn(struct cbuf *buf, const char *s, u16 slen) +static char *buf_put_stringn(struct cbuf *buf, const char *s, u16 slen) { + char *ret; + + ret = NULL; if (buf_check_size(buf, slen + 2)) { buf_put_int16(buf, slen); + ret = buf->p; memcpy(buf->p, s, slen); buf->p += slen; } + + return ret; } static inline void buf_put_string(struct cbuf *buf, const char *s) @@ -430,15 +436,19 @@ static inline void v9fs_put_int64(struct cbuf *bufp, u64 val, u64 * p) static void v9fs_put_str(struct cbuf *bufp, char *data, struct v9fs_str *str) { - if (data) { - str->len = strlen(data); - str->str = bufp->p; - } else { - str->len = 0; - str->str = NULL; - } + int len; + char *s; + + if (data) + len = strlen(data); + else + len = 0; - buf_put_stringn(bufp, data, str->len); + s = buf_put_stringn(bufp, data, len); + if (str) { + str->len = len; + str->str = s; + } } static int -- cgit v1.2.3 From 034b91a3b66cf9d2983ac45f73162395c0936c36 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Fri, 3 Feb 2006 03:04:20 -0800 Subject: [PATCH] v9fs: fix corner cases when flushing request When v9fs_mux_rpc sends a 9P message, it may be put in the queue of unsent request. If the user process receives a signal, v9fs_mux_rpc sets the request error to ERREQFLUSH and assigns NULL to request's send message. If the message was still in the unsent queue, v9fs_write_work would produce an oops while processing it. The patch makes sure that requests that are being flushed are moved to the pending requests queue safely. If a request is being flushed, don't remove it from the list of pending requests even if it receives a reply before the flush is acknoledged. The request will be removed during from the Rflush handler (v9fs_mux_flush_cb). Signed-off-by: Latchesar Ionkov Cc: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/mux.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/9p/mux.c b/fs/9p/mux.c index 945cb368d45..ea1134eb47c 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -471,10 +471,13 @@ static void v9fs_write_work(void *a) } spin_lock(&m->lock); - req = - list_entry(m->unsent_req_list.next, struct v9fs_req, +again: + req = list_entry(m->unsent_req_list.next, struct v9fs_req, req_list); list_move_tail(&req->req_list, &m->req_list); + if (req->err == ERREQFLUSH) + goto again; + m->wbuf = req->tcall->sdata; m->wsize = req->tcall->size; m->wpos = 0; @@ -525,7 +528,7 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) struct v9fs_str *ename; tag = req->tag; - if (req->rcall->id == RERROR && !req->err) { + if (!req->err && req->rcall->id == RERROR) { ecode = req->rcall->params.rerror.errno; ename = &req->rcall->params.rerror.error; @@ -551,7 +554,10 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) req->err = -EIO; } - if (req->cb && req->err != ERREQFLUSH) { + if (req->err == ERREQFLUSH) + return; + + if (req->cb) { dprintk(DEBUG_MUX, "calling callback tcall %p rcall %p\n", req->tcall, req->rcall); @@ -812,6 +818,7 @@ v9fs_mux_rpc_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err) struct v9fs_mux_rpc *r; if (err == ERREQFLUSH) { + kfree(rc); dprintk(DEBUG_MUX, "err req flush\n"); return; } -- cgit v1.2.3 From a18546110ed6bec483d55bfffccb2487dfbd77af Mon Sep 17 00:00:00 2001 From: "schwab@suse.de" Date: Fri, 3 Feb 2006 03:04:24 -0800 Subject: [PATCH] disable per cpu intr in /proc/stat Don't compute and display the per-irq sums on ia64 either, too much overhead for mostly useless figures. Cc: Olaf Hering Acked-by: "Luck, Tony" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 8f8014285a3..1d24fead51a 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -548,7 +548,7 @@ static int show_stat(struct seq_file *p, void *v) } seq_printf(p, "intr %llu", (unsigned long long)sum); -#if !defined(CONFIG_PPC64) && !defined(CONFIG_ALPHA) +#if !defined(CONFIG_PPC64) && !defined(CONFIG_ALPHA) && !defined(CONFIG_IA64) for (i = 0; i < NR_IRQS; i++) seq_printf(p, " %u", kstat_irqs(i)); #endif -- cgit v1.2.3 From 835417967c10b6dfaffdffddba59196196e5d431 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Fri, 3 Feb 2006 03:04:25 -0800 Subject: [PATCH] ext2: print xip mount option in ext2_show_options In case we have CONFIG_FS_XIP, ext2_show_options shows "xip" if EXT2_MOUNT_XIP mount flag is set. Signed-off-by: Carsten Otte Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/super.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 8d6819846fc..cb6f9bd658d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -221,6 +221,11 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",grpquota"); #endif +#if defined(CONFIG_EXT2_FS_XIP) + if (sbi->s_mount_opt & EXT2_MOUNT_XIP) + seq_puts(seq, ",xip"); +#endif + return 0; } -- cgit v1.2.3 From 35dc8161d0a6fa5e654bcb3d6240acc9ecb0a259 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 3 Feb 2006 03:04:27 -0800 Subject: [PATCH] fix O_DIRECT read of last block in a sparse file Currently, if you open a file O_DIRECT, truncate it to a size that is not a multiple of the disk block size, and then try to read the last block in the file, the read will return 0. The problem is in do_direct_IO, here: /* Handle holes */ if (!buffer_mapped(map_bh)) { char *kaddr; ... if (dio->block_in_file >= i_size_read(dio->inode)>>blkbits) { /* We hit eof */ page_cache_release(page); goto out; } We shift off any remaining bytes in the final block of the I/O, resulting in a 0-sized read. I've attached a patch that fixes this. I'm not happy about how ugly the math is getting, so suggestions are more than welcome. I've tested this with a simple program that performs the steps outlined for reproducing the problem above. Without the patch, we get a 0-sized result from read. With the patch, we get the correct return value from the short read. Signed-off-by: Jeff Moyer Cc: Badari Pulavarty Cc: Suparna Bhattacharya Cc: Mingming Cao Cc: Joel Becker Cc: "Chen, Kenneth W" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/direct-io.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 30dbbd1df51..848044af7e1 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -857,6 +857,7 @@ do_holes: /* Handle holes */ if (!buffer_mapped(map_bh)) { char *kaddr; + loff_t i_size_aligned; /* AKPM: eargh, -ENOTBLK is a hack */ if (dio->rw == WRITE) { @@ -864,8 +865,14 @@ do_holes: return -ENOTBLK; } + /* + * Be sure to account for a partial block as the + * last block in the file + */ + i_size_aligned = ALIGN(i_size_read(dio->inode), + 1 << blkbits); if (dio->block_in_file >= - i_size_read(dio->inode)>>blkbits) { + i_size_aligned >> blkbits) { /* We hit eof */ page_cache_release(page); goto out; -- cgit v1.2.3 From 7d95c8f27d9be65bf160f1edaf653d33dfceb58c Mon Sep 17 00:00:00 2001 From: dean gaudet Date: Fri, 3 Feb 2006 03:04:30 -0800 Subject: [PATCH] fcntl F_SETFL and read-only IS_APPEND files There is code in setfl() which attempts to preserve the O_APPEND flag on IS_APPEND files... however IS_APPEND files could also be opened O_RDONLY and in that case setfl() should not require O_APPEND... coreutils 5.93 tail -f attempts to set O_NONBLOCK even on regular files... unfortunately if you try this on an append-only log file the result is this: fcntl64(3, F_GETFL) = 0x8000 (flags O_RDONLY|O_LARGEFILE) fcntl64(3, F_SETFL, O_RDONLY|O_NONBLOCK|O_LARGEFILE) = -1 EPERM (Operation not permitted) I offer up the patch below as one way of fixing the problem... i've tested it fixes the problem with tail -f but haven't really tested beyond that. (I also reported the coreutils bug upstream... it shouldn't fail imho... ) Signed-off-by: dean gaudet Cc: Al Viro Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 5f96786d1c7..dc4a7007f4e 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -208,8 +208,11 @@ static int setfl(int fd, struct file * filp, unsigned long arg) struct inode * inode = filp->f_dentry->d_inode; int error = 0; - /* O_APPEND cannot be cleared if the file is marked as append-only */ - if (!(arg & O_APPEND) && IS_APPEND(inode)) + /* + * O_APPEND cannot be cleared if the file is marked as append-only + * and the file is open for write. + */ + if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode)) return -EPERM; /* O_NOATIME can only be set by the owner or superuser */ -- cgit v1.2.3 From 9d9c0531c91755a90b646b27bb722d59ee3eb46d Mon Sep 17 00:00:00 2001 From: Herbert Poetzl Date: Fri, 3 Feb 2006 03:04:37 -0800 Subject: [PATCH] quota: fix error code for ext2_new_inode() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The quota check in ext2_new_inode() returns ENOSPC where it should return EDQUOT instead. Signed-off-by: Herbert Pötzl Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 74714af4ae6..e52765219e1 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -605,7 +605,7 @@ got: insert_inode_hash(inode); if (DQUOT_ALLOC_INODE(inode)) { - err = -ENOSPC; + err = -EDQUOT; goto fail_drop; } -- cgit v1.2.3 From 5b00226d4d3aa7969d84e16f857ea100465d9c98 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 3 Feb 2006 03:04:42 -0800 Subject: [PATCH] fat: Replace an own implementation with ll_rw_block(SWRITE,) This patch replaces an own implementation with LL_RW_BLOCK(SWRITE,) which was newly added. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/misc.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 32fb0a3f1da..944652e9dde 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -196,19 +196,9 @@ EXPORT_SYMBOL_GPL(fat_date_unix2dos); int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) { - int i, e, err = 0; + int i, err = 0; - for (i = 0; i < nr_bhs; i++) { - lock_buffer(bhs[i]); - if (test_clear_buffer_dirty(bhs[i])) { - get_bh(bhs[i]); - bhs[i]->b_end_io = end_buffer_write_sync; - e = submit_bh(WRITE, bhs[i]); - if (!err && e) - err = e; - } else - unlock_buffer(bhs[i]); - } + ll_rw_block(SWRITE, nr_bhs, bhs); for (i = 0; i < nr_bhs; i++) { wait_on_buffer(bhs[i]); if (buffer_eopnotsupp(bhs[i])) { -- cgit v1.2.3 From e60e5c50aa5389db86e96fc52d02bc7db3d23f4a Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 3 Feb 2006 03:04:43 -0800 Subject: [PATCH] Trivial optimization of ll_rw_block() The ll_rw_block() needs to get ref-count only if it submits a buffer(). This patch avoids the needless get/put of ref-count. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 5e4a90ee103..62cfd17dc5f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2867,22 +2867,22 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) else if (test_set_buffer_locked(bh)) continue; - get_bh(bh); if (rw == WRITE || rw == SWRITE) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; + get_bh(bh); submit_bh(WRITE, bh); continue; } } else { if (!buffer_uptodate(bh)) { bh->b_end_io = end_buffer_read_sync; + get_bh(bh); submit_bh(rw, bh); continue; } } unlock_buffer(bh); - put_bh(bh); } } -- cgit v1.2.3 From 3b641407a1447759ac8159180e90ed2e4387a0b6 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Fri, 3 Feb 2006 03:04:44 -0800 Subject: [PATCH] fat: Fix truncate() write ordering The truncate() should write the file size before writing the new EOF entry. This patch fixes it. This bug was pointed out by Machida Hiroyuki. Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/file.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/fat/file.c b/fs/fat/file.c index e99c5a73b39..88aa1ae13f9 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -210,10 +210,30 @@ static int fat_free(struct inode *inode, int skip) if (MSDOS_I(inode)->i_start == 0) return 0; - /* - * Write a new EOF, and get the remaining cluster chain for freeing. - */ + fat_cache_inval_inode(inode); + wait = IS_DIRSYNC(inode); + i_start = free_start = MSDOS_I(inode)->i_start; + i_logstart = MSDOS_I(inode)->i_logstart; + + /* First, we write the new file size. */ + if (!skip) { + MSDOS_I(inode)->i_start = 0; + MSDOS_I(inode)->i_logstart = 0; + } + MSDOS_I(inode)->i_attrs |= ATTR_ARCH; + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + if (wait) { + err = fat_sync_inode(inode); + if (err) { + MSDOS_I(inode)->i_start = i_start; + MSDOS_I(inode)->i_logstart = i_logstart; + return err; + } + } else + mark_inode_dirty(inode); + + /* Write a new EOF, and get the remaining cluster chain for freeing. */ if (skip) { struct fat_entry fatent; int ret, fclus, dclus; @@ -244,35 +264,11 @@ static int fat_free(struct inode *inode, int skip) return ret; free_start = ret; - i_start = i_logstart = 0; - fat_cache_inval_inode(inode); - } else { - fat_cache_inval_inode(inode); - - i_start = free_start = MSDOS_I(inode)->i_start; - i_logstart = MSDOS_I(inode)->i_logstart; - MSDOS_I(inode)->i_start = 0; - MSDOS_I(inode)->i_logstart = 0; } - MSDOS_I(inode)->i_attrs |= ATTR_ARCH; - inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; - if (wait) { - err = fat_sync_inode(inode); - if (err) - goto error; - } else - mark_inode_dirty(inode); inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9); /* Freeing the remained cluster chain */ return fat_free_clusters(inode, free_start); - -error: - if (i_start) { - MSDOS_I(inode)->i_start = i_start; - MSDOS_I(inode)->i_logstart = i_logstart; - } - return err; } void fat_truncate(struct inode *inode) -- cgit v1.2.3 From 7656f328f68b351a8bb71ad465cedc8d0a039f9e Mon Sep 17 00:00:00 2001 From: Vincent Hanquez Date: Fri, 3 Feb 2006 03:04:48 -0800 Subject: [PATCH] debugfs: hard link count wrong Fix incorrect nlink of root inode for filesystems that use simple_fill_super(). Signed-off-by: Vincent Hanquez Cc: Greg KH Cc: Heiko Carstens Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/libfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 63c020e6589..71fd08fa410 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -388,6 +388,7 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; + inode->i_nlink = 2; root = d_alloc_root(inode); if (!root) { iput(inode); -- cgit v1.2.3 From 99603966f5b44693901ea68cef2c1c21ce6a49c3 Mon Sep 17 00:00:00 2001 From: "KAMBAROV, ZAUR" Date: Fri, 3 Feb 2006 03:04:49 -0800 Subject: [PATCH] coverity: udf/balloc.c null deref fix It's doing if (obh) else dereference obh So presumably `obh' is never null in there. This defect was found automatically by Coverity Prevent, a static analysis tool. Signed-off-by: Zaur Kambarov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/udf/balloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 4fae57d9d11..201049ac8a9 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -579,10 +579,9 @@ static void udf_table_free_blocks(struct super_block * sb, { loffset = nextoffset; aed->lengthAllocDescs = cpu_to_le32(adsize); - if (obh) - sptr = UDF_I_DATA(inode) + nextoffset - udf_file_entry_alloc_offset(inode) + UDF_I_LENEATTR(inode) - adsize; - else - sptr = obh->b_data + nextoffset - adsize; + sptr = UDF_I_DATA(inode) + nextoffset - + udf_file_entry_alloc_offset(inode) + + UDF_I_LENEATTR(inode) - adsize; dptr = nbh->b_data + sizeof(struct allocExtDesc); memcpy(dptr, sptr, adsize); nextoffset = sizeof(struct allocExtDesc) + adsize; -- cgit v1.2.3 From db9a369ec172c8251dbc6f7bf6bf13f6c5b6e7f5 Mon Sep 17 00:00:00 2001 From: Jayachandran C Date: Fri, 3 Feb 2006 03:04:50 -0800 Subject: [PATCH] UDF: Fix issues reported by Coverity in namei.c This patch fixes an issue in fs/udf/namei.c reported by Coverity: Error reported(1776) CID: 1776 Checker: UNUSED_VALUE (help) File: fs/udf/namei.c Function: udf_lookup Description: Pointer returned from "udf_find_entry" is never used Patch description: remove unused variable fi. Signed-off-by: Jayachandran C. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/udf/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/udf/namei.c b/fs/udf/namei.c index ca732e79c48..ab9a7629d23 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -296,7 +296,7 @@ static struct dentry * udf_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct inode *inode = NULL; - struct fileIdentDesc cfi, *fi; + struct fileIdentDesc cfi; struct udf_fileident_bh fibh; if (dentry->d_name.len > UDF_NAME_LEN-2) @@ -318,7 +318,7 @@ udf_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) else #endif /* UDF_RECOVERY */ - if ((fi = udf_find_entry(dir, dentry, &fibh, &cfi))) + if (udf_find_entry(dir, dentry, &fibh, &cfi)) { if (fibh.sbh != fibh.ebh) udf_release_data(fibh.ebh); -- cgit v1.2.3 From 8c5a950c9693aa24828d16dd7bc38bced3f37d48 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Fri, 6 Jan 2006 13:46:31 -0800 Subject: o Remove confusing Kconfig text for CONFIGFS_FS. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/Kconfig | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 93b5dc4082f..e9749b0eecd 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -883,8 +883,6 @@ config CONFIGFS_FS Both sysfs and configfs can and should exist together on the same system. One is not a replacement for the other. - If unsure, say N. - endmenu menu "Miscellaneous filesystems" -- cgit v1.2.3 From 0c6c98fb187524935a93fdd4f9a7193e7b110782 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 7 Jan 2006 20:07:02 +0100 Subject: [PATCH] OCFS2: __init / __exit problem Functions called by __init funtions mustn't be __exit. Reported by Jan-Benedict Glaw . Signed-off-by: Adrian Bunk Signed-off-by: Mark Fasheh --- fs/ocfs2/extent_map.c | 2 +- fs/ocfs2/uptodate.c | 2 +- fs/ocfs2/uptodate.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index f2fb40cd296..eb2bd8a4ca8 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -988,7 +988,7 @@ int __init init_ocfs2_extent_maps(void) return 0; } -void __exit exit_ocfs2_extent_maps(void) +void exit_ocfs2_extent_maps(void) { kmem_cache_destroy(ocfs2_em_ent_cachep); } diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 3a0458fd3e1..50c8fb3de0a 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -537,7 +537,7 @@ int __init init_ocfs2_uptodate_cache(void) return 0; } -void __exit exit_ocfs2_uptodate_cache(void) +void exit_ocfs2_uptodate_cache(void) { if (ocfs2_uptodate_cachep) kmem_cache_destroy(ocfs2_uptodate_cachep); diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h index e5aacdf4eab..01cd32d26b0 100644 --- a/fs/ocfs2/uptodate.h +++ b/fs/ocfs2/uptodate.h @@ -27,7 +27,7 @@ #define OCFS2_UPTODATE_H int __init init_ocfs2_uptodate_cache(void); -void __exit exit_ocfs2_uptodate_cache(void); +void exit_ocfs2_uptodate_cache(void); void ocfs2_metadata_cache_init(struct inode *inode); void ocfs2_metadata_cache_purge(struct inode *inode); -- cgit v1.2.3 From aee93ac4b7ad461255939248d0d51566cff77e05 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Mon, 9 Jan 2006 12:36:40 -0500 Subject: [PATCH] ocfs2/dlm: fix compilation on ia64 Including results in compilation failure on ia64 due to not including Including corrects the problem. Please apply. Signed-off-by: Jeff Mahoney Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/userdlm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index e1fdd288796..c3764f4744e 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -27,7 +27,7 @@ * Boston, MA 021110-1307, USA. */ -#include +#include #include #include -- cgit v1.2.3 From 251b6eccbeff4f0f8a3509769b327705e899f5dd Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 10 Jan 2006 15:41:43 -0800 Subject: [OCFS2] Make ip_io_sem a mutex ip_io_sem is now ip_io_mutex. Signed-off-by: Mark Fasheh --- fs/ocfs2/buffer_head_io.c | 10 +++++----- fs/ocfs2/inode.c | 6 +++--- fs/ocfs2/inode.h | 4 ++-- fs/ocfs2/journal.c | 4 ++-- fs/ocfs2/super.c | 2 +- fs/ocfs2/uptodate.c | 10 +++++----- 6 files changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index d424041b38e..bae3d7548be 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -58,7 +58,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, goto out; } - down(&OCFS2_I(inode)->ip_io_sem); + mutex_lock(&OCFS2_I(inode)->ip_io_mutex); lock_buffer(bh); set_buffer_uptodate(bh); @@ -82,7 +82,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, brelse(bh); } - up(&OCFS2_I(inode)->ip_io_sem); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); out: mlog_exit(ret); return ret; @@ -125,13 +125,13 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, flags &= ~OCFS2_BH_CACHED; if (inode) - down(&OCFS2_I(inode)->ip_io_sem); + mutex_lock(&OCFS2_I(inode)->ip_io_mutex); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { bhs[i] = sb_getblk(sb, block++); if (bhs[i] == NULL) { if (inode) - up(&OCFS2_I(inode)->ip_io_sem); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); status = -EIO; mlog_errno(status); goto bail; @@ -220,7 +220,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, ocfs2_set_buffer_uptodate(inode, bh); } if (inode) - up(&OCFS2_I(inode)->ip_io_sem); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr, (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes"); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index d4ecc062771..8122489c576 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -903,10 +903,10 @@ void ocfs2_clear_inode(struct inode *inode) "Clear inode of %"MLFu64", inode is locked\n", oi->ip_blkno); - mlog_bug_on_msg(down_trylock(&oi->ip_io_sem), - "Clear inode of %"MLFu64", io_sem is locked\n", + mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex), + "Clear inode of %"MLFu64", io_mutex is locked\n", oi->ip_blkno); - up(&oi->ip_io_sem); + mutex_unlock(&oi->ip_io_mutex); /* * down_trylock() returns 0, down_write_trylock() returns 1 diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 9b017743365..84c50796128 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -46,10 +46,10 @@ struct ocfs2_inode_info struct list_head ip_io_markers; int ip_orphaned_slot; - struct semaphore ip_io_sem; + struct mutex ip_io_mutex; /* Used by the journalling code to attach an inode to a - * handle. These are protected by ip_io_sem in order to lock + * handle. These are protected by ip_io_mutex in order to lock * out other I/O to the inode until we either commit or * abort. */ struct list_head ip_handle_list; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 303c8d96457..65bd69d1c71 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -401,7 +401,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle, * j_trans_barrier for us. */ ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); - down(&OCFS2_I(inode)->ip_io_sem); + mutex_lock(&OCFS2_I(inode)->ip_io_mutex); switch (type) { case OCFS2_JOURNAL_ACCESS_CREATE: case OCFS2_JOURNAL_ACCESS_WRITE: @@ -416,7 +416,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle, status = -EINVAL; mlog(ML_ERROR, "Uknown access type!\n"); } - up(&OCFS2_I(inode)->ip_io_sem); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); if (status < 0) mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 364d64bd5f1..c44075d4b57 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -932,7 +932,7 @@ static void ocfs2_inode_init_once(void *data, oi->ip_dir_start_lookup = 0; init_rwsem(&oi->ip_alloc_sem); - init_MUTEX(&(oi->ip_io_sem)); + mutex_init(&oi->ip_io_mutex); oi->ip_blkno = 0ULL; oi->ip_clusters = 0; diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 50c8fb3de0a..300b5bedfb2 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -388,7 +388,7 @@ out_free: } } -/* Item insertion is guarded by ip_io_sem, so the insertion path takes +/* Item insertion is guarded by ip_io_mutex, so the insertion path takes * advantage of this by not rechecking for a duplicate insert during * the slow case. Additionally, if the cache needs to be bumped up to * a tree, the code will not recheck after acquiring the lock -- @@ -418,7 +418,7 @@ void ocfs2_set_buffer_uptodate(struct inode *inode, (unsigned long long) bh->b_blocknr); /* No need to recheck under spinlock - insertion is guarded by - * ip_io_sem */ + * ip_io_mutex */ spin_lock(&oi->ip_lock); if (ocfs2_insert_can_use_array(oi, ci)) { /* Fast case - it's an array and there's a free @@ -440,7 +440,7 @@ void ocfs2_set_buffer_uptodate(struct inode *inode, /* Called against a newly allocated buffer. Most likely nobody should * be able to read this sort of metadata while it's still being - * allocated, but this is careful to take ip_io_sem anyway. */ + * allocated, but this is careful to take ip_io_mutex anyway. */ void ocfs2_set_new_buffer_uptodate(struct inode *inode, struct buffer_head *bh) { @@ -451,9 +451,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode, set_buffer_uptodate(bh); - down(&oi->ip_io_sem); + mutex_lock(&oi->ip_io_mutex); ocfs2_set_buffer_uptodate(inode, bh); - up(&oi->ip_io_sem); + mutex_unlock(&oi->ip_io_mutex); } /* Requires ip_lock. */ -- cgit v1.2.3 From e2faea4ce340f199c1957986c4c3dc2de76f5746 Mon Sep 17 00:00:00 2001 From: Kurt Hackel Date: Thu, 12 Jan 2006 14:24:55 -0800 Subject: [PATCH] ocfs2/dlm: fixes * fix a hang which can occur during shutdown migration * do not allow nodes to join during recovery * when restarting lock mastery, do not ignore nodes which come up * more than one node could become recovery master, fix this * sleep to allow some time for heartbeat state to catch up to network * extra debug info for bad recovery state problems * make DLM_RECO_NODE_DATA_DONE a valid state for non-master recovery nodes * prune all locks from dead nodes on $RECOVERY lock resources * do NOT automatically add new nodes to mle nodemaps until they have properly joined the domain * make sure dlm_pick_recovery_master only exits when all nodes have synced * properly handle dlmunlock errors in dlm_pick_recovery_master * do not propagate network errors in dlm_send_begin_reco_message * dead nodes were not being put in the recovery map sometimes, fix this * dlmunlock was failing to clear the unlock actions on DLM_DENIED Signed-off-by: Kurt Hackel Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmcommon.h | 1 + fs/ocfs2/dlm/dlmdomain.c | 18 +++- fs/ocfs2/dlm/dlmmaster.c | 24 +++-- fs/ocfs2/dlm/dlmrecovery.c | 249 ++++++++++++++++++++++++++++++++++++++------- fs/ocfs2/dlm/dlmunlock.c | 13 +++ 5 files changed, 256 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 3fecba0a602..42eb53b5293 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -657,6 +657,7 @@ void dlm_complete_thread(struct dlm_ctxt *dlm); int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); void dlm_wait_for_recovery(struct dlm_ctxt *dlm); +int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); void dlm_put(struct dlm_ctxt *dlm); struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index da3c22045f8..6ee30837389 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -573,8 +573,11 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm_domain_lock); dlm = __dlm_lookup_domain_full(query->domain, query->name_len); /* Once the dlm ctxt is marked as leaving then we don't want - * to be put in someone's domain map. */ + * to be put in someone's domain map. + * Also, explicitly disallow joining at certain troublesome + * times (ie. during recovery). */ if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { + int bit = query->node_idx; spin_lock(&dlm->spinlock); if (dlm->dlm_state == DLM_CTXT_NEW && @@ -586,6 +589,19 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { /* Disallow parallel joins. */ response = JOIN_DISALLOW; + } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { + mlog(ML_NOTICE, "node %u trying to join, but recovery " + "is ongoing.\n", bit); + response = JOIN_DISALLOW; + } else if (test_bit(bit, dlm->recovery_map)) { + mlog(ML_NOTICE, "node %u trying to join, but it " + "still needs recovery.\n", bit); + response = JOIN_DISALLOW; + } else if (test_bit(bit, dlm->domain_map)) { + mlog(ML_NOTICE, "node %u trying to join, but it " + "is still in the domain! needs recovery?\n", + bit); + response = JOIN_DISALLOW; } else { /* Alright we're fully a part of this domain * so we keep some state as to who's joining diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 27e984f7e4c..a3194fe173d 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1050,17 +1050,10 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, node = dlm_bitmap_diff_iter_next(&bdi, &sc); while (node >= 0) { if (sc == NODE_UP) { - /* a node came up. easy. might not even need - * to talk to it if its node number is higher - * or if we are already blocked. */ - mlog(0, "node up! %d\n", node); - if (blocked) - goto next; - - if (node > dlm->node_num) { - mlog(0, "node > this node. skipping.\n"); - goto next; - } + /* a node came up. clear any old vote from + * the response map and set it in the vote map + * then restart the mastery. */ + mlog(ML_NOTICE, "node %d up while restarting\n", node); /* redo the master request, but only for the new node */ mlog(0, "sending request to new node\n"); @@ -2005,6 +1998,15 @@ fail: break; mlog(0, "timed out during migration\n"); + /* avoid hang during shutdown when migrating lockres + * to a node which also goes down */ + if (dlm_is_node_dead(dlm, target)) { + mlog(0, "%s:%.*s: expected migration target %u " + "is no longer up. restarting.\n", + dlm->name, res->lockname.len, + res->lockname.name, target); + ret = -ERESTARTSYS; + } } if (ret == -ERESTARTSYS) { /* migration failed, detach and clean up mle */ diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 0c8eb1093f0..325c9f5529c 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -256,6 +256,27 @@ static int dlm_recovery_thread(void *data) return 0; } +/* returns true when the recovery master has contacted us */ +static int dlm_reco_master_ready(struct dlm_ctxt *dlm) +{ + int ready; + spin_lock(&dlm->spinlock); + ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM); + spin_unlock(&dlm->spinlock); + return ready; +} + +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) +{ + int dead; + spin_lock(&dlm->spinlock); + dead = test_bit(node, dlm->domain_map); + spin_unlock(&dlm->spinlock); + return dead; +} + /* callers of the top-level api calls (dlmlock/dlmunlock) should * block on the dlm->reco.event when recovery is in progress. * the dlm recovery thread will set this state when it begins @@ -297,6 +318,7 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm) static int dlm_do_recovery(struct dlm_ctxt *dlm) { int status = 0; + int ret; spin_lock(&dlm->spinlock); @@ -343,10 +365,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) goto master_here; if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { - /* choose a new master */ - if (!dlm_pick_recovery_master(dlm)) { + /* choose a new master, returns 0 if this node + * is the master, -EEXIST if it's another node. + * this does not return until a new master is chosen + * or recovery completes entirely. */ + ret = dlm_pick_recovery_master(dlm); + if (!ret) { /* already notified everyone. go. */ - dlm->reco.new_master = dlm->node_num; goto master_here; } mlog(0, "another node will master this recovery session.\n"); @@ -371,8 +396,13 @@ master_here: if (status < 0) { mlog(ML_ERROR, "error %d remastering locks for node %u, " "retrying.\n", status, dlm->reco.dead_node); + /* yield a bit to allow any final network messages + * to get handled on remaining nodes */ + msleep(100); } else { /* success! see if any other nodes need recovery */ + mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", + dlm->name, dlm->reco.dead_node, dlm->node_num); dlm_reset_recovery(dlm); } dlm_end_recovery(dlm); @@ -477,7 +507,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) BUG(); break; case DLM_RECO_NODE_DATA_DEAD: - mlog(0, "node %u died after " + mlog(ML_NOTICE, "node %u died after " "requesting recovery info for " "node %u\n", ndata->node_num, dead_node); @@ -485,6 +515,19 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) // start all over destroy = 1; status = -EAGAIN; + /* instead of spinning like crazy here, + * wait for the domain map to catch up + * with the network state. otherwise this + * can be hit hundreds of times before + * the node is really seen as dead. */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, + ndata->node_num), + msecs_to_jiffies(1000)); + mlog(0, "waited 1 sec for %u, " + "dead? %s\n", ndata->node_num, + dlm_is_node_dead(dlm, ndata->node_num) ? + "yes" : "no"); goto leave; case DLM_RECO_NODE_DATA_RECEIVING: case DLM_RECO_NODE_DATA_REQUESTED: @@ -678,11 +721,27 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) dlm = item->dlm; dead_node = item->u.ral.dead_node; reco_master = item->u.ral.reco_master; + mres = (struct dlm_migratable_lockres *)data; + + if (dead_node != dlm->reco.dead_node || + reco_master != dlm->reco.new_master) { + /* show extra debug info if the recovery state is messed */ + mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), " + "request(dead=%u, master=%u)\n", + dlm->name, dlm->reco.dead_node, dlm->reco.new_master, + dead_node, reco_master); + mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u " + "entry[0]={c=%"MLFu64",l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n", + dlm->name, mres->lockname_len, mres->lockname, mres->master, + mres->num_locks, mres->total_locks, mres->flags, + mres->ml[0].cookie, mres->ml[0].list, mres->ml[0].flags, + mres->ml[0].type, mres->ml[0].convert_type, + mres->ml[0].highest_blocked, mres->ml[0].node); + BUG(); + } BUG_ON(dead_node != dlm->reco.dead_node); BUG_ON(reco_master != dlm->reco.new_master); - mres = (struct dlm_migratable_lockres *)data; - /* lock resources should have already been moved to the * dlm->reco.resources list. now move items from that list * to a temp list if the dead owner matches. note that the @@ -757,15 +816,18 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) continue; switch (ndata->state) { + /* should have moved beyond INIT but not to FINALIZE yet */ case DLM_RECO_NODE_DATA_INIT: case DLM_RECO_NODE_DATA_DEAD: - case DLM_RECO_NODE_DATA_DONE: case DLM_RECO_NODE_DATA_FINALIZE_SENT: mlog(ML_ERROR, "bad ndata state for node %u:" " state=%d\n", ndata->node_num, ndata->state); BUG(); break; + /* these states are possible at this point, anywhere along + * the line of recovery */ + case DLM_RECO_NODE_DATA_DONE: case DLM_RECO_NODE_DATA_RECEIVING: case DLM_RECO_NODE_DATA_REQUESTED: case DLM_RECO_NODE_DATA_REQUESTING: @@ -799,13 +861,31 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, { struct dlm_lock_resource *res; struct list_head *iter, *iter2; + struct dlm_lock *lock; spin_lock(&dlm->spinlock); list_for_each_safe(iter, iter2, &dlm->reco.resources) { res = list_entry (iter, struct dlm_lock_resource, recovering); + /* always prune any $RECOVERY entries for dead nodes, + * otherwise hangs can occur during later recovery */ if (dlm_is_recovery_lock(res->lockname.name, - res->lockname.len)) + res->lockname.len)) { + spin_lock(&res->spinlock); + list_for_each_entry(lock, &res->granted, list) { + if (lock->ml.node == dead_node) { + mlog(0, "AHA! there was " + "a $RECOVERY lock for dead " + "node %u (%s)!\n", + dead_node, dlm->name); + list_del_init(&lock->list); + dlm_lock_put(lock); + break; + } + } + spin_unlock(&res->spinlock); continue; + } + if (res->owner == dead_node) { mlog(0, "found lockres owned by dead node while " "doing recovery for node %u. sending it.\n", @@ -1179,7 +1259,7 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data) again: ret = dlm_lockres_master_requery(dlm, res, &real_master); if (ret < 0) { - mlog(0, "dlm_lockres_master_requery failure: %d\n", + mlog(0, "dlm_lockres_master_requery ret=%d\n", ret); goto again; } @@ -1757,6 +1837,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) struct dlm_lock_resource *res; int i; struct list_head *bucket; + struct dlm_lock *lock; /* purge any stale mles */ @@ -1780,10 +1861,25 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) bucket = &(dlm->resources[i]); list_for_each(iter, bucket) { res = list_entry (iter, struct dlm_lock_resource, list); + /* always prune any $RECOVERY entries for dead nodes, + * otherwise hangs can occur during later recovery */ if (dlm_is_recovery_lock(res->lockname.name, - res->lockname.len)) + res->lockname.len)) { + spin_lock(&res->spinlock); + list_for_each_entry(lock, &res->granted, list) { + if (lock->ml.node == dead_node) { + mlog(0, "AHA! there was " + "a $RECOVERY lock for dead " + "node %u (%s)!\n", + dead_node, dlm->name); + list_del_init(&lock->list); + dlm_lock_put(lock); + break; + } + } + spin_unlock(&res->spinlock); continue; - + } spin_lock(&res->spinlock); /* zero the lvb if necessary */ dlm_revalidate_lvb(dlm, res, dead_node); @@ -1869,12 +1965,9 @@ void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data) return; spin_lock(&dlm->spinlock); - set_bit(idx, dlm->live_nodes_map); - - /* notify any mles attached to the heartbeat events */ - dlm_hb_event_notify_attached(dlm, idx, 1); - + /* do NOT notify mle attached to the heartbeat events. + * new nodes are not interesting in mastery until joined. */ spin_unlock(&dlm->spinlock); dlm_put(dlm); @@ -1897,7 +1990,18 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st) mlog(0, "unlockast for recovery lock fired!\n"); } - +/* + * dlm_pick_recovery_master will continually attempt to use + * dlmlock() on the special "$RECOVERY" lockres with the + * LKM_NOQUEUE flag to get an EX. every thread that enters + * this function on each node racing to become the recovery + * master will not stop attempting this until either: + * a) this node gets the EX (and becomes the recovery master), + * or b) dlm->reco.new_master gets set to some nodenum + * != O2NM_INVALID_NODE_NUM (another node will do the reco). + * so each time a recovery master is needed, the entire cluster + * will sync at this point. if the new master dies, that will + * be detected in dlm_do_recovery */ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) { enum dlm_status ret; @@ -1906,23 +2010,45 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); -retry: +again: memset(&lksb, 0, sizeof(lksb)); ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast); + mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n", + dlm->name, ret, lksb.status); + if (ret == DLM_NORMAL) { mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", dlm->name, dlm->node_num); - /* I am master, send message to all nodes saying - * that I am beginning a recovery session */ - status = dlm_send_begin_reco_message(dlm, - dlm->reco.dead_node); + + /* got the EX lock. check to see if another node + * just became the reco master */ + if (dlm_reco_master_ready(dlm)) { + mlog(0, "%s: got reco EX lock, but %u will " + "do the recovery\n", dlm->name, + dlm->reco.new_master); + status = -EEXIST; + } else { + status = dlm_send_begin_reco_message(dlm, + dlm->reco.dead_node); + /* this always succeeds */ + BUG_ON(status); + + /* set the new_master to this node */ + spin_lock(&dlm->spinlock); + dlm->reco.new_master = dlm->node_num; + spin_unlock(&dlm->spinlock); + } /* recovery lock is a special case. ast will not get fired, * so just go ahead and unlock it. */ ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); + if (ret == DLM_DENIED) { + mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n"); + ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm); + } if (ret != DLM_NORMAL) { /* this would really suck. this could only happen * if there was a network error during the unlock @@ -1930,20 +2056,42 @@ retry: * is actually "done" and the lock structure is * even freed. we can continue, but only * because this specific lock name is special. */ - mlog(0, "dlmunlock returned %d\n", ret); - } - - if (status < 0) { - mlog(0, "failed to send recovery message. " - "must retry with new node map.\n"); - goto retry; + mlog(ML_ERROR, "dlmunlock returned %d\n", ret); } } else if (ret == DLM_NOTQUEUED) { mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", dlm->name, dlm->node_num); /* another node is master. wait on - * reco.new_master != O2NM_INVALID_NODE_NUM */ + * reco.new_master != O2NM_INVALID_NODE_NUM + * for at most one second */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_reco_master_ready(dlm), + msecs_to_jiffies(1000)); + if (!dlm_reco_master_ready(dlm)) { + mlog(0, "%s: reco master taking awhile\n", + dlm->name); + goto again; + } + /* another node has informed this one that it is reco master */ + mlog(0, "%s: reco master %u is ready to recover %u\n", + dlm->name, dlm->reco.new_master, dlm->reco.dead_node); status = -EEXIST; + } else { + struct dlm_lock_resource *res; + + /* dlmlock returned something other than NOTQUEUED or NORMAL */ + mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), " + "lksb.status=%s\n", dlm->name, dlm_errname(ret), + dlm_errname(lksb.status)); + res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, + DLM_RECOVERY_LOCK_NAME_LEN); + if (res) { + dlm_print_one_lock_resource(res); + dlm_lockres_put(res); + } else { + mlog(ML_ERROR, "recovery lock not found\n"); + } + BUG(); } return status; @@ -1982,7 +2130,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) mlog(0, "not sending begin reco to self\n"); continue; } - +retry: ret = -EINVAL; mlog(0, "attempting to send begin reco msg to %d\n", nodenum); @@ -1991,8 +2139,17 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) /* negative status is handled ok by caller here */ if (ret >= 0) ret = status; + if (dlm_is_host_down(ret)) { + /* node is down. not involved in recovery + * so just keep going */ + mlog(0, "%s: node %u was down when sending " + "begin reco msg (%d)\n", dlm->name, nodenum, ret); + ret = 0; + } if (ret < 0) { struct dlm_lock_resource *res; + /* this is now a serious problem, possibly ENOMEM + * in the network stack. must retry */ mlog_errno(ret); mlog(ML_ERROR, "begin reco of dlm %s to node %u " " returned %d\n", dlm->name, nodenum, ret); @@ -2004,7 +2161,10 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) } else { mlog(ML_ERROR, "recovery lock not found\n"); } - break; + /* sleep for a bit in hopes that we can avoid + * another ENOMEM */ + msleep(100); + goto retry; } } @@ -2027,19 +2187,34 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm->spinlock); if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { - mlog(0, "new_master already set to %u!\n", - dlm->reco.new_master); + if (test_bit(dlm->reco.new_master, dlm->recovery_map)) { + mlog(0, "%s: new_master %u died, changing " + "to %u\n", dlm->name, dlm->reco.new_master, + br->node_idx); + } else { + mlog(0, "%s: new_master %u NOT DEAD, changing " + "to %u\n", dlm->name, dlm->reco.new_master, + br->node_idx); + /* may not have seen the new master as dead yet */ + } } if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { - mlog(0, "dead_node already set to %u!\n", - dlm->reco.dead_node); + mlog(ML_NOTICE, "%s: dead_node previously set to %u, " + "node %u changing it to %u\n", dlm->name, + dlm->reco.dead_node, br->node_idx, br->dead_node); } dlm->reco.new_master = br->node_idx; dlm->reco.dead_node = br->dead_node; if (!test_bit(br->dead_node, dlm->recovery_map)) { - mlog(ML_ERROR, "recovery master %u sees %u as dead, but this " + mlog(0, "recovery master %u sees %u as dead, but this " "node has not yet. marking %u as dead\n", br->node_idx, br->dead_node, br->dead_node); + if (!test_bit(br->dead_node, dlm->domain_map) || + !test_bit(br->dead_node, dlm->live_nodes_map)) + mlog(0, "%u not in domain/live_nodes map " + "so setting it in reco map manually\n", + br->dead_node); + set_bit(br->dead_node, dlm->recovery_map); __dlm_hb_node_down(dlm, br->dead_node); } spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index cec2ce1cd31..c95f08d2e92 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -188,6 +188,19 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, actions &= ~(DLM_UNLOCK_REMOVE_LOCK| DLM_UNLOCK_REGRANT_LOCK| DLM_UNLOCK_CLEAR_CONVERT_TYPE); + } else if (status == DLM_RECOVERING || + status == DLM_MIGRATING || + status == DLM_FORWARD) { + /* must clear the actions because this unlock + * is about to be retried. cannot free or do + * any list manipulation. */ + mlog(0, "%s:%.*s: clearing actions, %s\n", + dlm->name, res->lockname.len, + res->lockname.name, + status==DLM_RECOVERING?"recovering": + (status==DLM_MIGRATING?"migrating": + "forward")); + actions = 0; } if (flags & LKM_CANCEL) lock->cancel_pending = 0; -- cgit v1.2.3 From c74ec2f77a7763a4a56c6cb13ecab961e1bbb456 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 13 Jan 2006 21:54:23 -0800 Subject: [PATCH] ocfs2: Semaphore to mutex conversion. Semaphore to mutex conversion. The conversion was generated via scripts, and the result was validated automatically via a script as well. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Mark Fasheh --- fs/ocfs2/journal.c | 10 +++++----- fs/ocfs2/ocfs2.h | 3 ++- fs/ocfs2/super.c | 6 +++--- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 65bd69d1c71..ccabed9a0aa 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1072,10 +1072,10 @@ restart: NULL); bail: - down(&osb->recovery_lock); + mutex_lock(&osb->recovery_lock); if (!status && !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { - up(&osb->recovery_lock); + mutex_unlock(&osb->recovery_lock); goto restart; } @@ -1083,7 +1083,7 @@ bail: mb(); /* sync with ocfs2_recovery_thread_running */ wake_up(&osb->recovery_event); - up(&osb->recovery_lock); + mutex_unlock(&osb->recovery_lock); mlog_exit(status); /* no one is callint kthread_stop() for us so the kthread() api @@ -1098,7 +1098,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) mlog_entry("(node_num=%d, osb->node_num = %d)\n", node_num, osb->node_num); - down(&osb->recovery_lock); + mutex_lock(&osb->recovery_lock); if (osb->disable_recovery) goto out; @@ -1120,7 +1120,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) } out: - up(&osb->recovery_lock); + mutex_unlock(&osb->recovery_lock); wake_up(&osb->recovery_event); mlog_exit_void(); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index f468c600cf9..8d8e4779df9 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -33,6 +33,7 @@ #include #include #include +#include #include "cluster/nodemanager.h" #include "cluster/heartbeat.h" @@ -233,7 +234,7 @@ struct ocfs2_super struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/ */ atomic_t vol_state; - struct semaphore recovery_lock; + struct mutex recovery_lock; struct task_struct *recovery_thread_task; int disable_recovery; wait_queue_head_t checkpoint_event; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c44075d4b57..e7e17bdf629 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1137,9 +1137,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) /* disable any new recovery threads and wait for any currently * running ones to exit. Do this before setting the vol_state. */ - down(&osb->recovery_lock); + mutex_lock(&osb->recovery_lock); osb->disable_recovery = 1; - up(&osb->recovery_lock); + mutex_unlock(&osb->recovery_lock); wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); /* At this point, we know that no more recovery threads can be @@ -1283,7 +1283,7 @@ static int ocfs2_initialize_super(struct super_block *sb, snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); - init_MUTEX(&osb->recovery_lock); + mutex_init(&osb->recovery_lock); osb->disable_recovery = 0; osb->recovery_thread_task = NULL; -- cgit v1.2.3 From b4c7f538508adcde7a0a5162faec0b2ab19b90bd Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 14 Jan 2006 20:55:10 +0100 Subject: [PATCH] fs/ocfs2/dlm/dlmrecovery.c must #include fs/ocfs2/dlm/dlmrecovery.c does now use msleep(), and does therefore need to #include for getting the prototype of this function. Signed-off-by: Adrian Bunk Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmrecovery.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 325c9f5529c..186e9a76aa5 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "cluster/heartbeat.h" -- cgit v1.2.3 From ebdec83ba46c123fe3bfdcaacf62d0dfe8fe4187 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn / snakebyte Date: Fri, 27 Jan 2006 10:32:52 +0100 Subject: [PATCH] BUG_ON() Conversion in fs/ocfs2/ this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can better optimized away. Signed-off-by: Eric Sesterhenn Signed-off-by: Mark Fasheh --- fs/ocfs2/extent_map.c | 10 ++++------ fs/ocfs2/journal.c | 12 ++++-------- fs/ocfs2/super.c | 3 +-- fs/ocfs2/sysfile.c | 6 ++---- 4 files changed, 11 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index eb2bd8a4ca8..b6ba292e954 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -262,8 +262,7 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode, el = &eb->h_list; } - if (el->l_tree_depth) - BUG(); + BUG_ON(el->l_tree_depth); for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { rec = &el->l_recs[i]; @@ -364,8 +363,8 @@ static int ocfs2_extent_map_lookup_read(struct inode *inode, return ret; } - if (ent->e_tree_depth) - BUG(); /* FIXME: Make sure this isn't a corruption */ + /* FIXME: Make sure this isn't a corruption */ + BUG_ON(ent->e_tree_depth); *ret_ent = ent; @@ -423,8 +422,7 @@ static int ocfs2_extent_map_try_insert(struct inode *inode, le32_to_cpu(rec->e_clusters), NULL, NULL); - if (!old_ent) - BUG(); + BUG_ON(!old_ent); ret = -EEXIST; if (old_ent->e_tree_depth < tree_depth) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index ccabed9a0aa..b71b3385fdb 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -147,8 +147,7 @@ struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb, mlog_entry("(max_buffs = %d)\n", max_buffs); - if (!osb || !osb->journal->j_journal) - BUG(); + BUG_ON(!osb || !osb->journal->j_journal); if (ocfs2_is_hard_readonly(osb)) { ret = -EROFS; @@ -672,8 +671,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) mlog_entry_void(); - if (!osb) - BUG(); + BUG_ON(!osb); journal = osb->journal; if (!journal) @@ -805,8 +803,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) mlog_entry_void(); - if (!journal) - BUG(); + BUG_ON(!journal); status = journal_wipe(journal->j_journal, full); if (status < 0) { @@ -1271,8 +1268,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, /* Should not ever be called to recover ourselves -- in that * case we should've called ocfs2_journal_load instead. */ - if (osb->node_num == node_num) - BUG(); + BUG_ON(osb->node_num == node_num); slot_num = ocfs2_node_num_to_slot(si, node_num); if (slot_num == OCFS2_INVALID_SLOT) { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index e7e17bdf629..046824b6b62 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1254,8 +1254,7 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->sb = sb; /* Save off for ocfs2_rw_direct */ osb->s_sectsize_bits = blksize_bits(sector_size); - if (!osb->s_sectsize_bits) - BUG(); + BUG_ON(!osb->s_sectsize_bits); osb->net_response_ids = 0; spin_lock_init(&osb->net_response_lock); diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 600a8bc5b54..fc29cb7a437 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -77,8 +77,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, if (arr && ((inode = *arr) != NULL)) { /* get a ref in addition to the array ref */ inode = igrab(inode); - if (!inode) - BUG(); + BUG_ON(!inode); return inode; } @@ -89,8 +88,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, /* add one more if putting into array for first time */ if (arr && inode) { *arr = igrab(inode); - if (!*arr) - BUG(); + BUG_ON(!*arr); } return inode; } -- cgit v1.2.3 From 215c7f9fa11d3fc6ccd2df242d259c721ec7ae6a Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 1 Feb 2006 16:42:10 -0800 Subject: [PATCH] ocfs2: fix compile warnings Fix a couple of compile warnings found when compiling on a ppc64 build box. Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/heartbeat.c | 5 +++-- fs/ocfs2/cluster/tcp.c | 16 +++++++++------- fs/ocfs2/file.c | 10 ++++++---- 3 files changed, 18 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 7307ba52891..d08971d29b6 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -917,8 +917,9 @@ static int o2hb_thread(void *data) elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", - before_hb.tv_sec, before_hb.tv_usec, - after_hb.tv_sec, after_hb.tv_usec, elapsed_msec); + before_hb.tv_sec, (unsigned long) before_hb.tv_usec, + after_hb.tv_sec, (unsigned long) after_hb.tv_usec, + elapsed_msec); if (elapsed_msec < reg->hr_timeout_ms) { /* the kthread api has blocked signals for us so no diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 35d92c01a97..d22d4cf08db 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1285,14 +1285,16 @@ static void o2net_idle_timer(unsigned long data) mlog(ML_NOTICE, "here are some times that might help debug the " "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", - sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec, - now.tv_sec, now.tv_usec, - sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec, - sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec, - sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec, + sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, + now.tv_sec, (long) now.tv_usec, + sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, + sc->sc_tv_advance_start.tv_sec, + (long) sc->sc_tv_advance_start.tv_usec, + sc->sc_tv_advance_stop.tv_sec, + (long) sc->sc_tv_advance_stop.tv_usec, sc->sc_msg_key, sc->sc_msg_type, - sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec, - sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec); + sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, + sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); o2net_sc_queue_work(sc, &sc->sc_shutdown_work); } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index eaf33caa0a1..1715bc90e70 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1022,8 +1022,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, } newsize = count + saved_pos; - mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n", - saved_pos, newsize, i_size_read(inode)); + mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", + (long long) saved_pos, (long long) newsize, + (long long) i_size_read(inode)); /* No need for a higher level metadata lock if we're * never going past i_size. */ @@ -1042,8 +1043,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, spin_unlock(&OCFS2_I(inode)->ip_lock); mlog(0, "Writing at EOF, may need more allocation: " - "i_size = %lld, newsize = %"MLFu64", need %u clusters\n", - i_size_read(inode), newsize, clusters); + "i_size = %lld, newsize = %lld, need %u clusters\n", + (long long) i_size_read(inode), (long long) newsize, + clusters); /* We only want to continue the rest of this loop if * our extend will actually require more -- cgit v1.2.3 From 3d0f89bb169482d26d5aa4e82e763077e7e9bc4d Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Wed, 25 Jan 2006 13:31:07 -0800 Subject: configfs: Add permission and ownership to configfs objects. configfs always made item and attribute ownership root.root and permissions based on a umask of 022. Add ->setattr() to allow chown(2)/chmod(2), and persist the changes for the lifetime of the items and attributes. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/configfs_internal.h | 11 ++-- fs/configfs/dir.c | 36 +++++++++---- fs/configfs/file.c | 19 +++---- fs/configfs/inode.c | 117 ++++++++++++++++++++++++++++++++++++---- fs/configfs/mount.c | 28 ++++++++-- fs/configfs/symlink.c | 1 + 6 files changed, 176 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 8899d9c5f6b..f70e46951b3 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -36,6 +36,7 @@ struct configfs_dirent { int s_type; umode_t s_mode; struct dentry * s_dentry; + struct iattr * s_iattr; }; #define CONFIGFS_ROOT 0x0001 @@ -48,10 +49,11 @@ struct configfs_dirent { #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) extern struct vfsmount * configfs_mount; +extern kmem_cache_t *configfs_dir_cachep; extern int configfs_is_root(struct config_item *item); -extern struct inode * configfs_new_inode(mode_t mode); +extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *); extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *)); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); @@ -63,6 +65,7 @@ extern void configfs_hash_and_remove(struct dentry * dir, const char * name); extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); +extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr); extern int configfs_pin_fs(void); extern void configfs_release_fs(void); @@ -120,8 +123,10 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry static inline void release_configfs_dirent(struct configfs_dirent * sd) { - if (!(sd->s_type & CONFIGFS_ROOT)) - kfree(sd); + if (!(sd->s_type & CONFIGFS_ROOT)) { + kfree(sd->s_iattr); + kmem_cache_free(configfs_dir_cachep, sd); + } } static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index b668ec61527..ca60e3abef4 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -72,7 +72,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare { struct configfs_dirent * sd; - sd = kmalloc(sizeof(*sd), GFP_KERNEL); + sd = kmem_cache_alloc(configfs_dir_cachep, GFP_KERNEL); if (!sd) return NULL; @@ -136,13 +136,19 @@ static int create_dir(struct config_item * k, struct dentry * p, int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; - error = configfs_create(d, mode, init_dir); + error = configfs_make_dirent(p->d_fsdata, d, k, mode, + CONFIGFS_DIR); if (!error) { - error = configfs_make_dirent(p->d_fsdata, d, k, mode, - CONFIGFS_DIR); + error = configfs_create(d, mode, init_dir); if (!error) { p->d_inode->i_nlink++; (d)->d_op = &configfs_dentry_ops; + } else { + struct configfs_dirent *sd = d->d_fsdata; + if (sd) { + list_del_init(&sd->s_sibling); + configfs_put(sd); + } } } return error; @@ -182,12 +188,19 @@ int configfs_create_link(struct configfs_symlink *sl, int err = 0; umode_t mode = S_IFLNK | S_IRWXUGO; - err = configfs_create(dentry, mode, init_symlink); + err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode, + CONFIGFS_ITEM_LINK); if (!err) { - err = configfs_make_dirent(parent->d_fsdata, dentry, sl, - mode, CONFIGFS_ITEM_LINK); + err = configfs_create(dentry, mode, init_symlink); if (!err) dentry->d_op = &configfs_dentry_ops; + else { + struct configfs_dirent *sd = dentry->d_fsdata; + if (sd) { + list_del_init(&sd->s_sibling); + configfs_put(sd); + } + } } return err; } @@ -241,13 +254,15 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den struct configfs_attribute * attr = sd->s_element; int error; + dentry->d_fsdata = configfs_get(sd); + sd->s_dentry = dentry; error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file); - if (error) + if (error) { + configfs_put(sd); return error; + } dentry->d_op = &configfs_dentry_ops; - dentry->d_fsdata = configfs_get(sd); - sd->s_dentry = dentry; d_rehash(dentry); return 0; @@ -839,6 +854,7 @@ struct inode_operations configfs_dir_inode_operations = { .symlink = configfs_symlink, .unlink = configfs_unlink, .lookup = configfs_lookup, + .setattr = configfs_setattr, }; #if 0 diff --git a/fs/configfs/file.c b/fs/configfs/file.c index c26cd61f13a..3921920d871 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -26,7 +26,6 @@ #include #include -#include #include #include #include @@ -150,7 +149,7 @@ out: /** * fill_write_buffer - copy buffer from userspace. * @buffer: data buffer for file. - * @userbuf: data from user. + * @buf: data from user. * @count: number of bytes in @userbuf. * * Allocate @buffer->page if it hasn't been already, then @@ -177,8 +176,9 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size /** * flush_write_buffer - push buffer to config_item. - * @file: file pointer. + * @dentry: dentry to the attribute * @buffer: data buffer for file. + * @count: number of bytes * * Get the correct pointers for the config_item and the attribute we're * dealing with, then call the store() method for the attribute, @@ -217,15 +217,16 @@ static ssize_t configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct configfs_buffer * buffer = file->private_data; + ssize_t len; down(&buffer->sem); - count = fill_write_buffer(buffer,buf,count); - if (count > 0) - count = flush_write_buffer(file->f_dentry,buffer,count); - if (count > 0) - *ppos += count; + len = fill_write_buffer(buffer, buf, count); + if (len > 0) + len = flush_write_buffer(file->f_dentry, buffer, count); + if (len > 0) + *ppos += len; up(&buffer->sem); - return count; + return len; } static int check_perm(struct inode * inode, struct file * file) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 6577c588de9..737842f2764 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "configfs_internal.h" @@ -48,18 +49,107 @@ static struct backing_dev_info configfs_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, }; -struct inode * configfs_new_inode(mode_t mode) +static struct inode_operations configfs_inode_operations ={ + .setattr = configfs_setattr, +}; + +int configfs_setattr(struct dentry * dentry, struct iattr * iattr) +{ + struct inode * inode = dentry->d_inode; + struct configfs_dirent * sd = dentry->d_fsdata; + struct iattr * sd_iattr; + unsigned int ia_valid = iattr->ia_valid; + int error; + + if (!sd) + return -EINVAL; + + sd_iattr = sd->s_iattr; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + error = inode_setattr(inode, iattr); + if (error) + return error; + + if (!sd_iattr) { + /* setting attributes for the first time, allocate now */ + sd_iattr = kmalloc(sizeof(struct iattr), GFP_KERNEL); + if (!sd_iattr) + return -ENOMEM; + /* assign default attributes */ + memset(sd_iattr, 0, sizeof(struct iattr)); + sd_iattr->ia_mode = sd->s_mode; + sd_iattr->ia_uid = 0; + sd_iattr->ia_gid = 0; + sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; + sd->s_iattr = sd_iattr; + } + + /* attributes were changed atleast once in past */ + + if (ia_valid & ATTR_UID) + sd_iattr->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + sd_iattr->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + sd_iattr->ia_mode = sd->s_mode = mode; + } + + return error; +} + +static inline void set_default_inode_attr(struct inode * inode, mode_t mode) +{ + inode->i_mode = mode; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) +{ + inode->i_mode = iattr->ia_mode; + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd) { struct inode * inode = new_inode(configfs_sb); if (inode) { - inode->i_mode = mode; - inode->i_uid = 0; - inode->i_gid = 0; inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->a_ops = &configfs_aops; inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; + inode->i_op = &configfs_inode_operations; + + if (sd->s_iattr) { + /* sysfs_dirent has non-default attributes + * get them for the new inode from persistent copy + * in sysfs_dirent + */ + set_inode_attr(inode, sd->s_iattr); + } else + set_default_inode_attr(inode, mode); } return inode; } @@ -70,7 +160,8 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode * struct inode * inode = NULL; if (dentry) { if (!dentry->d_inode) { - if ((inode = configfs_new_inode(mode))) { + struct configfs_dirent *sd = dentry->d_fsdata; + if ((inode = configfs_new_inode(mode, sd))) { if (dentry->d_parent && dentry->d_parent->d_inode) { struct inode *p_inode = dentry->d_parent->d_inode; p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; @@ -103,7 +194,7 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode * */ const unsigned char * configfs_get_name(struct configfs_dirent *sd) { - struct attribute * attr; + struct configfs_attribute *attr; if (!sd || !sd->s_element) BUG(); @@ -114,7 +205,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd) if (sd->s_type & CONFIGFS_ITEM_ATTR) { attr = sd->s_element; - return attr->name; + return attr->ca_name; } return NULL; } @@ -130,13 +221,17 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) if (dentry) { spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); if (!(d_unhashed(dentry) && dentry->d_inode)) { dget_locked(dentry); __d_drop(dentry); + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, dentry); - } else + } else { + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); + } } } @@ -145,6 +240,10 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) struct configfs_dirent * sd; struct configfs_dirent * parent_sd = dir->d_fsdata; + if (dir->d_inode == NULL) + /* no inode means this hasn't been made visible yet */ + return; + mutex_lock(&dir->d_inode->i_mutex); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 1a2f6f6a4d9..f920d30478e 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -38,6 +38,7 @@ struct vfsmount * configfs_mount = NULL; struct super_block * configfs_sb = NULL; +kmem_cache_t *configfs_dir_cachep; static int configfs_mnt_count = 0; static struct super_operations configfs_ops = { @@ -62,6 +63,7 @@ static struct configfs_dirent configfs_root = { .s_children = LIST_HEAD_INIT(configfs_root.s_children), .s_element = &configfs_root_group.cg_item, .s_type = CONFIGFS_ROOT, + .s_iattr = NULL, }; static int configfs_fill_super(struct super_block *sb, void *data, int silent) @@ -73,9 +75,11 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = CONFIGFS_MAGIC; sb->s_op = &configfs_ops; + sb->s_time_gran = 1; configfs_sb = sb; - inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO); + inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + &configfs_root); if (inode) { inode->i_op = &configfs_dir_inode_operations; inode->i_fop = &configfs_dir_operations; @@ -128,19 +132,31 @@ static decl_subsys(config, NULL, NULL); static int __init configfs_init(void) { - int err; + int err = -ENOMEM; + + configfs_dir_cachep = kmem_cache_create("configfs_dir_cache", + sizeof(struct configfs_dirent), + 0, 0, NULL, NULL); + if (!configfs_dir_cachep) + goto out; kset_set_kset_s(&config_subsys, kernel_subsys); err = subsystem_register(&config_subsys); - if (err) - return err; + if (err) { + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; + goto out; + } err = register_filesystem(&configfs_fs_type); if (err) { printk(KERN_ERR "configfs: Unable to register filesystem!\n"); subsystem_unregister(&config_subsys); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; } +out: return err; } @@ -148,11 +164,13 @@ static void __exit configfs_exit(void) { unregister_filesystem(&configfs_fs_type); subsystem_unregister(&config_subsys); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; } MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.0.1"); +MODULE_VERSION("0.0.2"); MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); module_init(configfs_init); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 50f5840521a..99137026b40 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -277,5 +277,6 @@ struct inode_operations configfs_symlink_inode_operations = { .follow_link = configfs_follow_link, .readlink = generic_readlink, .put_link = configfs_put_link, + .setattr = configfs_setattr, }; -- cgit v1.2.3 From 1a1974fd4533afdb73873cdacb942d9a79ff7c9b Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn / snakebyte Date: Fri, 27 Jan 2006 10:32:24 +0100 Subject: [PATCH] BUG_ON() Conversion in fs/configfs/ this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can better optimized away. Signed-off-by: Eric Sesterhenn Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/inode.c | 3 +-- fs/configfs/symlink.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 737842f2764..c153bd9534c 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -196,8 +196,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd) { struct configfs_attribute *attr; - if (!sd || !sd->s_element) - BUG(); + BUG_ON(!sd || !sd->s_element); /* These always have a dentry, so use that */ if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 99137026b40..e5512e295cf 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -162,8 +162,7 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry) if (!(sd->s_type & CONFIGFS_ITEM_LINK)) goto out; - if (dentry->d_parent == configfs_sb->s_root) - BUG(); + BUG_ON(dentry->d_parent == configfs_sb->s_root); sl = sd->s_element; -- cgit v1.2.3 From 6eff5790d57a5c9c01489c95946881808a4b2a2c Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 18 Jan 2006 10:31:47 -0800 Subject: [PATCH] ocfs2: don't wait on recovery when locking journal The mount path had incorrectly asked the locking code to wait for recovery completion, which deadlocks things because recovery waits for mount to complete first. Signed-off-by: Mark Fasheh --- fs/ocfs2/journal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index b71b3385fdb..fa0bcac5cea 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -560,7 +560,11 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) SET_INODE_JOURNAL(inode); OCFS2_I(inode)->ip_open_count++; - status = ocfs2_meta_lock(inode, NULL, &bh, 1); + /* Skip recovery waits here - journal inode metadata never + * changes in a live cluster so it can be considered an + * exception to the rule. */ + status = ocfs2_meta_lock_full(inode, NULL, &bh, 1, + OCFS2_META_LOCK_RECOVERY); if (status < 0) { if (status != -ERESTARTSYS) mlog(ML_ERROR, "Could not get lock on journal!\n"); -- cgit v1.2.3 From 88a2a4ac6b671a4b0dd5d2d762418904c05f4104 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 4 Feb 2006 23:27:36 -0800 Subject: [PATCH] percpu data: only iterate over possible CPUs percpu_data blindly allocates bootmem memory to store NR_CPUS instances of cpudata, instead of allocating memory only for possible cpus. As a preparation for changing that, we need to convert various 0 -> NR_CPUS loops to use for_each_cpu(). (The above only applies to users of asm-generic/percpu.h. powerpc has gone it alone and is presently only allocating memory for present CPUs, so it's currently corrupting memory). Signed-off-by: Eric Dumazet Cc: "David S. Miller" Cc: James Bottomley Acked-by: Ingo Molnar Cc: Jens Axboe Cc: Anton Blanchard Acked-by: William Irwin Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/file.c b/fs/file.c index fd066b261c7..cea7cbea11d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -379,7 +379,6 @@ static void __devinit fdtable_defer_list_init(int cpu) void __init files_defer_init(void) { int i; - /* Really early - can't use for_each_cpu */ - for (i = 0; i < NR_CPUS; i++) + for_each_cpu(i) fdtable_defer_list_init(i); } -- cgit v1.2.3 From 7128ec2a747d7a5f3c764c37bef17081ccc2374c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 4 Feb 2006 23:27:40 -0800 Subject: [PATCH] fuse: fix request_end() vs fuse_reset_request() race The last fix for this function in fact opened up a much more often triggering race. It was uncommented tricky code, that was buggy. Add comment, make it less tricky and fix bug. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4526da8907c..f556a0d5c0d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -120,9 +120,9 @@ struct fuse_req *fuse_get_request(struct fuse_conn *fc) return do_get_request(fc); } +/* Must be called with fuse_lock held */ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) { - spin_lock(&fuse_lock); if (req->preallocated) { atomic_dec(&fc->num_waiting); list_add(&req->list, &fc->unused_list); @@ -134,10 +134,18 @@ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) fc->outstanding_debt--; else up(&fc->outstanding_sem); - spin_unlock(&fuse_lock); } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +{ + if (atomic_dec_and_test(&req->count)) { + spin_lock(&fuse_lock); + fuse_putback_request(fc, req); + spin_unlock(&fuse_lock); + } +} + +static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) fuse_putback_request(fc, req); @@ -163,26 +171,36 @@ void fuse_release_background(struct fuse_req *req) * still waiting), the 'end' callback is called if given, else the * reference to the request is released * + * Releasing extra reference for foreground requests must be done + * within the same locked region as setting state to finished. This + * is because fuse_reset_request() may be called after request is + * finished and it must be the sole possessor. If request is + * interrupted and put in the background, it will return with an error + * and hence never be reset and reused. + * * Called with fuse_lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) { - void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; - req->end = NULL; list_del(&req->list); req->state = FUSE_REQ_FINISHED; - spin_unlock(&fuse_lock); - if (req->background) { + if (!req->background) { + wake_up(&req->waitq); + fuse_put_request_locked(fc, req); + spin_unlock(&fuse_lock); + } else { + void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; + req->end = NULL; + spin_unlock(&fuse_lock); down_read(&fc->sbput_sem); if (fc->mounted) fuse_release_background(req); up_read(&fc->sbput_sem); + if (end) + end(fc, req); + else + fuse_put_request(fc, req); } - wake_up(&req->waitq); - if (end) - end(fc, req); - else - fuse_put_request(fc, req); } /* -- cgit v1.2.3 From fe1dcbc4f311c2e6c23b33c0fa8572461618ab3e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 4 Feb 2006 23:27:54 -0800 Subject: [PATCH] jbd: fix transaction batching Ben points out that: When writing files out using O_SYNC, jbd's 1 jiffy delay results in a significant drop in throughput as the disk sits idle. The patch below results in a 4-5x performance improvement (from 6.5MB/s to ~24-30MB/s on my IDE test box) when writing out files using O_SYNC. So optimise the batching code by omitting it entirely if the process which is doing a sync write is the same as the one which did the most recent sync write. If that's true, we're unlikely to get any other processes joining the transaction. (Has been in -mm for ages - it took me a long time to get on to performance testing it) Numbers, on write-cache-disabled IDE: /usr/bin/time -p synctest -n 10 -uf -t 1 -p 1 dir-name Unpatched: 40 seconds Patched: 35 seconds Batching disabled: 35 seconds This is the problematic single-process-doing-fsync case. With multiple fsyncing processes the numbers are AFACIT unaltered by the patch. Aside: performance testing and instrumentation shows that the transaction batching almost doesn't help (testing with synctest -n 1 -uf -t 100 -p 10 dir-name on non-writeback-caching IDE). This is because by the time one process is running a synchronous commit, a bunch of other processes already have a transaction handle open, so they're all going to batch into the same transaction anyway. The batching seems to offer maybe 5-10% speedup with this workload, but I'm pretty sure it was more important than that when it was first developed 4-odd years ago... Cc: "Stephen C. Tweedie" Cc: Benjamin LaHaise Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/transaction.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 429f4b263cf..ca917973c2c 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1308,6 +1308,7 @@ int journal_stop(handle_t *handle) transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; int old_handle_count, err; + pid_t pid; J_ASSERT(transaction->t_updates > 0); J_ASSERT(journal_current_handle() == handle); @@ -1333,8 +1334,15 @@ int journal_stop(handle_t *handle) * It doesn't cost much - we're about to run a commit and sleep * on IO anyway. Speeds up many-threaded, many-dir operations * by 30x or more... + * + * But don't do this if this process was the most recent one to + * perform a synchronous write. We do this to detect the case where a + * single process is doing a stream of sync writes. No point in waiting + * for joiners in that case. */ - if (handle->h_sync) { + pid = current->pid; + if (handle->h_sync && journal->j_last_sync_writer != pid) { + journal->j_last_sync_writer = pid; do { old_handle_count = transaction->t_handle_count; schedule_timeout_uninterruptible(1); -- cgit v1.2.3 From f55eab822b93864ef4eef3bd7eadac2a727c914b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 4 Feb 2006 23:28:01 -0800 Subject: [PATCH] VFS: Ensure LOOKUP_CONTINUE flag is preserved by link_path_walk() When walking a path, the LOOKUP_CONTINUE flag is used by some filesystems (for instance NFS) in order to determine whether or not it is looking up the last component of the path. It this is the case, it may have to look at the intent information in order to perform various tasks such as atomic open. A problem currently occurs when link_path_walk() hits a symlink. In this case LOOKUP_CONTINUE may be cleared prematurely when we hit the end of the path passed by __vfs_follow_link() (i.e. the end of the symlink path) rather than when we hit the end of the path passed by the user. The solution is to have link_path_walk() clear LOOKUP_CONTINUE if and only if that flag was unset when we entered the function. Signed-off-by: Trond Myklebust Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 7ac9fb4acb2..b760e1e18b4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -790,7 +790,7 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd) inode = nd->dentry->d_inode; if (nd->depth) - lookup_flags = LOOKUP_FOLLOW; + lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); /* At this point we know we have a real path component. */ for(;;) { @@ -885,7 +885,8 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd) last_with_slashes: lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; last_component: - nd->flags &= ~LOOKUP_CONTINUE; + /* Clear LOOKUP_CONTINUE iff it was previously unset */ + nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; if (lookup_flags & LOOKUP_PARENT) goto lookup_parent; if (this.name[0] == '.') switch (this.len) { -- cgit v1.2.3 From 170aa3d02614ae621d54af10555e2f48977ae8de Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sat, 4 Feb 2006 23:28:02 -0800 Subject: [PATCH] namei.c: unlock missing in error case Signed-off-by: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index b760e1e18b4..faf61c35308 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1070,6 +1070,8 @@ static int fastcall do_path_lookup(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { int retval = 0; + int fput_needed; + struct file *file; nd->last_type = LAST_ROOT; /* if there are only slashes... */ nd->flags = flags; @@ -1091,29 +1093,22 @@ static int fastcall do_path_lookup(int dfd, const char *name, nd->mnt = mntget(current->fs->pwdmnt); nd->dentry = dget(current->fs->pwd); } else { - struct file *file; - int fput_needed; struct dentry *dentry; file = fget_light(dfd, &fput_needed); - if (!file) { - retval = -EBADF; - goto out_fail; - } + retval = -EBADF; + if (!file) + goto unlock_fail; dentry = file->f_dentry; - if (!S_ISDIR(dentry->d_inode->i_mode)) { - retval = -ENOTDIR; - fput_light(file, fput_needed); - goto out_fail; - } + retval = -ENOTDIR; + if (!S_ISDIR(dentry->d_inode->i_mode)) + goto fput_unlock_fail; retval = file_permission(file, MAY_EXEC); - if (retval) { - fput_light(file, fput_needed); - goto out_fail; - } + if (retval) + goto fput_unlock_fail; nd->mnt = mntget(file->f_vfsmnt); nd->dentry = dget(dentry); @@ -1127,7 +1122,12 @@ out: if (unlikely(current->audit_context && nd && nd->dentry && nd->dentry->d_inode)) audit_inode(name, nd->dentry->d_inode, flags); -out_fail: + return retval; + +fput_unlock_fail: + fput_light(file, fput_needed); +unlock_fail: + read_unlock(¤t->fs->lock); return retval; } -- cgit v1.2.3