From ea319518ba3de282c13ae1cf4bf2215c5e03e67e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Dec 2008 15:08:55 +0100 Subject: locking, percpu counters: introduce separate lock classes Impact: fix lockdep false positives Classify percpu_counter instances similar to regular lock objects -- that is, per instantiation site. The networking code has increased its use of percpu_counters, which leads to false positives if they are treated as a single class. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- mm/backing-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f2e574dbc30..f3b12585782 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -220,7 +220,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->max_prop_frac = PROP_FRAC_BASE; for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { - err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0); + err = percpu_counter_init(&bdi->bdi_stat[i], 0); if (err) goto err; } -- cgit v1.2.3 From 2e4e27c7d082b2198b63041310609d7191185a9d Mon Sep 17 00:00:00 2001 From: Adam Lackorzynski Date: Sun, 4 Jan 2009 12:00:46 -0800 Subject: vmalloc.c: fix flushing in vmap_page_range() The flush_cache_vmap in vmap_page_range() is called with the end of the range twice. The following patch fixes this for me. Signed-off-by: Adam Lackorzynski Cc: Nick Piggin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1ddb77ba399..7465f22fec0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -151,11 +151,12 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr, * * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] */ -static int vmap_page_range(unsigned long addr, unsigned long end, +static int vmap_page_range(unsigned long start, unsigned long end, pgprot_t prot, struct page **pages) { pgd_t *pgd; unsigned long next; + unsigned long addr = start; int err = 0; int nr = 0; @@ -167,7 +168,7 @@ static int vmap_page_range(unsigned long addr, unsigned long end, if (err) break; } while (pgd++, addr = next, addr != end); - flush_cache_vmap(addr, end); + flush_cache_vmap(start, end); if (unlikely(err)) return err; -- cgit v1.2.3 From 54566b2c1594c2326a645a3551f9d989f7ba3c5e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sun, 4 Jan 2009 12:00:53 -0800 Subject: fs: symlink write_begin allocation context fix With the write_begin/write_end aops, page_symlink was broken because it could no longer pass a GFP_NOFS type mask into the point where the allocations happened. They are done in write_begin, which would always assume that the filesystem can be entered from reclaim. This bug could cause filesystem deadlocks. The funny thing with having a gfp_t mask there is that it doesn't really allow the caller to arbitrarily tinker with the context in which it can be called. It couldn't ever be GFP_ATOMIC, for example, because it needs to take the page lock. The only thing any callers care about is __GFP_FS anyway, so turn that into a single flag. Add a new flag for write_begin, AOP_FLAG_NOFS. Filesystems can now act on this flag in their write_begin function. Change __grab_cache_page to accept a nofs argument as well, to honour that flag (while we're there, change the name to grab_cache_page_write_begin which is more instructive and does away with random leading underscores). This is really a more flexible way to go in the end anyway -- if a filesystem happens to want any extra allocations aside from the pagecache ones in ints write_begin function, it may now use GFP_KERNEL (rather than GFP_NOFS) for common case allocations (eg. ocfs2_alloc_write_ctxt, for a random example). [kosaki.motohiro@jp.fujitsu.com: fix ubifs] [kosaki.motohiro@jp.fujitsu.com: fix fuse] Signed-off-by: Nick Piggin Reviewed-by: KOSAKI Motohiro Cc: [2.6.28.x] Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton [ Cleaned up the calling convention: just pass in the AOP flags untouched to the grab_cache_page_write_begin() function. That just simplifies everybody, and may even allow future expansion of the logic. - Linus ] Signed-off-by: Linus Torvalds --- mm/filemap.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index f3e5f8944d1..f8c69273c37 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2140,19 +2140,24 @@ EXPORT_SYMBOL(generic_file_direct_write); * Find or create a page at the given pagecache position. Return the locked * page. This function is specifically for buffered writes. */ -struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) +struct page *grab_cache_page_write_begin(struct address_space *mapping, + pgoff_t index, unsigned flags) { int status; struct page *page; + gfp_t gfp_notmask = 0; + if (flags & AOP_FLAG_NOFS) + gfp_notmask = __GFP_FS; repeat: page = find_lock_page(mapping, index); if (likely(page)) return page; - page = page_cache_alloc(mapping); + page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); if (!page) return NULL; - status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + status = add_to_page_cache_lru(page, mapping, index, + GFP_KERNEL & ~gfp_notmask); if (unlikely(status)) { page_cache_release(page); if (status == -EEXIST) @@ -2161,7 +2166,7 @@ repeat: } return page; } -EXPORT_SYMBOL(__grab_cache_page); +EXPORT_SYMBOL(grab_cache_page_write_begin); static ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos) -- cgit v1.2.3 From 7f5ff766a7babd72fc192125e12ef5570effff4c Mon Sep 17 00:00:00 2001 From: Dmitri Monakhov Date: Mon, 1 Dec 2008 14:34:56 -0800 Subject: kill suid bit only for regular files We don't have to do it because it is useless for non regular files. In fact block device may trigger this path without dentry->d_inode->i_mutex. (akpm: concerns were expressed (by me) about S_ISDIR inodes) Signed-off-by: Dmitri Monakhov Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index f3e5f8944d1..ed53ce87625 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1766,7 +1766,7 @@ int should_remove_suid(struct dentry *dentry) if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) kill |= ATTR_KILL_SGID; - if (unlikely(kill && !capable(CAP_FSETID))) + if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode))) return kill; return 0; -- cgit v1.2.3 From acfa4380efe77e290d3a96b11cd4c9f24f4fbb18 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Dec 2008 10:06:33 -0500 Subject: inode->i_op is never NULL We used to have rather schizophrenic set of checks for NULL ->i_op even though it had been eliminated years ago. You'd need to go out of your way to set it to NULL explicitly _and_ a bunch of code would die on such inodes anyway. After killing two remaining places that still did that bogosity, all that crap can go away. Signed-off-by: Al Viro --- mm/memory.c | 4 ++-- mm/nommu.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 0a2010a9518..7b9db658aca 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2266,7 +2266,7 @@ int vmtruncate(struct inode * inode, loff_t offset) unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); } - if (inode->i_op && inode->i_op->truncate) + if (inode->i_op->truncate) inode->i_op->truncate(inode); return 0; @@ -2286,7 +2286,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) * a way to truncate a range of blocks (punch a hole) - * we should return failure right now. */ - if (!inode->i_op || !inode->i_op->truncate_range) + if (!inode->i_op->truncate_range) return -ENOSYS; mutex_lock(&inode->i_mutex); diff --git a/mm/nommu.c b/mm/nommu.c index 7695dc85078..1c28ea3a4e9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -86,7 +86,7 @@ do_expand: i_size_write(inode, offset); out_truncate: - if (inode->i_op && inode->i_op->truncate) + if (inode->i_op->truncate) inode->i_op->truncate(inode); return 0; out_sig: -- cgit v1.2.3 From 4c728ef583b3d82266584da5cb068294c09df31e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 22 Dec 2008 21:11:15 +0100 Subject: add a vfs_fsync helper Fsync currently has a fdatawrite/fdatawait pair around the method call, and a mutex_lock/unlock of the inode mutex. All callers of fsync have to duplicate this, but we have a few and most of them don't quite get it right. This patch adds a new vfs_fsync that takes care of this. It's a little more complicated as usual as ->fsync might get a NULL file pointer and just a dentry from nfsd, but otherwise gets afile and we want to take the mapping and file operations from it when it is there. Notes on the fsync callers: - ecryptfs wasn't calling filemap_fdatawrite / filemap_fdatawait on the lower file - coda wasn't calling filemap_fdatawrite / filemap_fdatawait on the host file, and returning 0 when ->fsync was missing - shm wasn't calling either filemap_fdatawrite / filemap_fdatawait nor taking i_mutex. Now given that shared memory doesn't have disk backing not doing anything in fsync seems fine and I left it out of the vfs_fsync conversion for now, but in that case we might just not pass it through to the lower file at all but just call the no-op simple_sync_file directly. [and now actually export vfs_fsync] Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- mm/msync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/msync.c b/mm/msync.c index 144a7570535..07dae08cf31 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -82,7 +82,7 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) (vma->vm_flags & VM_SHARED)) { get_file(file); up_read(&mm->mmap_sem); - error = do_fsync(file, 0); + error = vfs_fsync(file, file->f_path.dentry, 0); fput(file); if (error || start >= end) goto out; -- cgit v1.2.3 From 046c68842bce6b77509cf56e94a561029124b0ce Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 5 Jan 2009 14:06:29 +0000 Subject: mm: update my address Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index d4855a682ab..2c778fcfd9b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3,7 +3,7 @@ * * Written by obz. * - * Address space accounting code + * Address space accounting code */ #include diff --git a/mm/mprotect.c b/mm/mprotect.c index fded06f923f..cfb4c485206 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -4,7 +4,7 @@ * (C) Copyright 1994 Linus Torvalds * (C) Copyright 2002 Christoph Hellwig * - * Address space accounting code + * Address space accounting code * (C) Copyright 2002 Red Hat Inc, All Rights Reserved */ diff --git a/mm/mremap.c b/mm/mremap.c index 58a2908f42f..646de959aa5 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -3,7 +3,7 @@ * * (C) Copyright 1996 Linus Torvalds * - * Address space accounting code + * Address space accounting code * (C) Copyright 2002 Red Hat Inc, All Rights Reserved */ -- cgit v1.2.3