From 8f72e4028a1ff968000cec4a034f45619fbd7ec4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Jul 2006 04:44:01 -0700 Subject: [PATCH] fadvise: remove dead comments Cc: "Michael Kerrisk" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index 0a03357a1f8..60a5d55e51d 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -23,18 +23,6 @@ /* * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. - * - * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file - * offsets `offset' and `offset+len' inclusive. Any pages which are currently - * under writeout are skipped, whether or not they are dirty. - * - * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file - * offsets `offset' and `offset+len'. - * - * By combining these two operations the application may do several things: - * - * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. - * */ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) { -- cgit v1.2.3 From 6d46cc6b9b04dc28a9c5db62db791aeec8ab2ea5 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Jul 2006 04:44:21 -0700 Subject: [PATCH] mm/bootmem.c: EXPORT_UNUSED_SYMBOL This patch marks an unused export as EXPORT_UNUSED_SYMBOL. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index d213feded10..50353e0dac1 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -29,9 +29,7 @@ unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -EXPORT_SYMBOL(max_pfn); /* This is exported so - * dma_get_required_mask(), which uses - * it, can be an inline function */ +EXPORT_UNUSED_SYMBOL(max_pfn); /* June 2006 */ static LIST_HEAD(bdata_list); #ifdef CONFIG_CRASH_DUMP -- cgit v1.2.3 From 26fc52367af3774b123334bca409159ce37d2857 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Jul 2006 04:44:22 -0700 Subject: [PATCH] mm/memory.c: EXPORT_UNUSED_SYMBOL This patch marks an unused export as EXPORT_UNUSED_SYMBOL. Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index c1e14c9e67e..dc0d82cf2a1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1853,7 +1853,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) return 0; } -EXPORT_SYMBOL(vmtruncate_range); +EXPORT_UNUSED_SYMBOL(vmtruncate_range); /* June 2006 */ /* * Primitive swap readahead code. We simply read an aligned block of -- cgit v1.2.3 From b0d85c5c3009d292fe195f666cbbec7da47dabf4 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Jul 2006 04:44:23 -0700 Subject: [PATCH] mm/mmzone.c: EXPORT_UNUSED_SYMBOL This patch marks three unused exports as EXPORT_UNUSED_SYMBOL. 
Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmzone.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmzone.c b/mm/mmzone.c index 0959ee1a479..febea1c9816 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -14,7 +14,7 @@ struct pglist_data *first_online_pgdat(void) return NODE_DATA(first_online_node); } -EXPORT_SYMBOL(first_online_pgdat); +EXPORT_UNUSED_SYMBOL(first_online_pgdat); /* June 2006 */ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) { @@ -24,7 +24,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) return NULL; return NODE_DATA(nid); } -EXPORT_SYMBOL(next_online_pgdat); +EXPORT_UNUSED_SYMBOL(next_online_pgdat); /* June 2006 */ /* @@ -45,5 +45,5 @@ struct zone *next_zone(struct zone *zone) } return zone; } -EXPORT_SYMBOL(next_zone); +EXPORT_UNUSED_SYMBOL(next_zone); /* June 2006 */ -- cgit v1.2.3 From 32dd66fce3b0ad5857433433b795844cb397608e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 10 Jul 2006 04:44:31 -0700 Subject: [PATCH] vmstat: export all_vm_events() Add missing EXPORT_SYMBOL for all_vm_events(). Git commit f8891e5e1f93a128c3900f82035e8541357896a7 caused this: Building modules, stage 2. MODPOST WARNING: "all_vm_events" [arch/s390/appldata/appldata_mem.ko] undefined! CC arch/s390/appldata/appldata_mem.mod.o Cc: Christoph Lameter Cc: Gerald Schaefer Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 73b83d67bab..dfdf2413390 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -81,6 +81,7 @@ void all_vm_events(unsigned long *ret) { sum_vm_events(ret, &cpu_online_map); } +EXPORT_SYMBOL_GPL(all_vm_events); #ifdef CONFIG_HOTPLUG /* -- cgit v1.2.3 From 873623dfabaa6ebbdc1ce16c1766a3c0ec5d9923 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Jul 2006 14:44:38 +0200 Subject: [PATCH] lockdep: undo mm/slab.c annotation undo existing mm/slab.c lock-validator annotations, in preparation of a new, less intrusive annotation patch. 
Signed-off-by: Ingo Molnar Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- mm/slab.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 85c2e03098a..fd1e4c4c139 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1021,8 +1021,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, } } -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp, - int nesting) +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { struct slab *slabp = virt_to_slab(objp); int nodeid = slabp->nodeid; @@ -1040,7 +1039,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp, STATS_INC_NODEFREES(cachep); if (l3->alien && l3->alien[nodeid]) { alien = l3->alien[nodeid]; - spin_lock_nested(&alien->lock, nesting); + spin_lock(&alien->lock); if (unlikely(alien->avail == alien->limit)) { STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, alien, nodeid); @@ -1069,8 +1068,7 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) { } -static inline int cache_free_alien(struct kmem_cache *cachep, void *objp, - int nesting) +static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { return 0; } @@ -1760,8 +1758,6 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) } #endif -static void __cache_free(struct kmem_cache *cachep, void *objp, int nesting); - /** * slab_destroy - destroy and release all objects in a slab * @cachep: cache pointer being destroyed @@ -1785,17 +1781,8 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) call_rcu(&slab_rcu->head, kmem_rcu_free); } else { kmem_freepages(cachep, addr); - if (OFF_SLAB(cachep)) { - unsigned long flags; - - /* - * lockdep: we may nest inside an already held - * ac->lock, so pass in a nesting flag: - */ - local_irq_save(flags); - __cache_free(cachep->slabp_cache, slabp, 1); - local_irq_restore(flags); - } + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->slabp_cache, slabp); } } @@ -3135,7 +3122,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) #endif check_irq_off(); l3 = cachep->nodelists[node]; - spin_lock_nested(&l3->list_lock, SINGLE_DEPTH_NESTING); + spin_lock(&l3->list_lock); if (l3->shared) { struct array_cache *shared_array = l3->shared; int max = shared_array->limit - shared_array->avail; @@ -3178,14 +3165,14 @@ free_done: * Release an obj back to its cache. If the obj has a constructed state, it must * be in this state _before_ it is released. Called with disabled ints. 
*/ -static void __cache_free(struct kmem_cache *cachep, void *objp, int nesting) +static inline void __cache_free(struct kmem_cache *cachep, void *objp) { struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); - if (cache_free_alien(cachep, objp, nesting)) + if (cache_free_alien(cachep, objp)) return; if (likely(ac->avail < ac->limit)) { @@ -3424,7 +3411,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) BUG_ON(virt_to_cache(objp) != cachep); local_irq_save(flags); - __cache_free(cachep, objp, 0); + __cache_free(cachep, objp); local_irq_restore(flags); } EXPORT_SYMBOL(kmem_cache_free); @@ -3449,7 +3436,7 @@ void kfree(const void *objp) kfree_debugcheck(objp); c = virt_to_cache(objp); debug_check_no_locks_freed(objp, obj_size(c)); - __cache_free(c, (void *)objp, 0); + __cache_free(c, (void *)objp); local_irq_restore(flags); } EXPORT_SYMBOL(kfree); -- cgit v1.2.3 From f1aaee53f2877a7afa55e8245c241ff60a86367d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 13 Jul 2006 14:46:03 +0200 Subject: [PATCH] lockdep: annotate mm/slab.c mm/slab.c uses nested locking when dealing with 'off-slab' caches, in that case it allocates the slab header from the (on-slab) kmalloc caches. Teach the lock validator about this by putting all on-slab caches into a separate class. this patch has no effect on non-lockdep kernels. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- mm/slab.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index fd1e4c4c139..5a57cda7490 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -674,6 +674,37 @@ static struct kmem_cache cache_cache = { #endif }; +#ifdef CONFIG_LOCKDEP + +/* + * Slab sometimes uses the kmalloc slabs to store the slab headers + * for other slabs "off slab". + * The locking for this is tricky in that it nests within the locks + * of all other slabs in a few places; to deal with this special + * locking we put on-slab caches into a separate lock-class. + */ +static struct lock_class_key on_slab_key; + +static inline void init_lock_keys(struct cache_sizes *s) +{ + int q; + + for (q = 0; q < MAX_NUMNODES; q++) { + if (!s->cs_cachep->nodelists[q] || OFF_SLAB(s->cs_cachep)) + continue; + lockdep_set_class(&s->cs_cachep->nodelists[q]->list_lock, + &on_slab_key); + } +} + +#else +static inline void init_lock_keys(struct cache_sizes *s) +{ +} +#endif + + + /* Guard access to the cache-chain. */ static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; @@ -1391,6 +1422,7 @@ void __init kmem_cache_init(void) ARCH_KMALLOC_FLAGS|SLAB_PANIC, NULL, NULL); } + init_lock_keys(sizes); sizes->cs_dmacachep = kmem_cache_create(names->name_dma, sizes->cs_size, -- cgit v1.2.3 From fc818301a8a39fedd7f0a71f878f29130c72193d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Jul 2006 09:12:21 +0200 Subject: [PATCH] revert slab.c locking change Chandra Seetharaman reported SLAB crashes caused by the slab.c lock annotation patch. There is only one chunk of that patch that has a material effect on the slab logic - this patch undoes that chunk. This was confirmed to fix the slab problem by Chandra. 
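Note that this revert does not touch the lock-class annotation added two patches up (on_slab_key / init_lock_keys()); only the unlock/relock around slab_destroy() in free_block() goes away. The class annotation is the general lockdep recipe for locks that may legitimately nest inside another lock of the same type: move one side into its own class so the validator stops reporting false recursion while still catching real deadlocks. A rough sketch of that recipe, with hypothetical names not taken from these patches:

	/* Hypothetical child/parent queue locks that can nest: give the child
	 * locks their own lockdep class, the same way init_lock_keys() above
	 * does for the on-slab caches' l3->list_lock. */
	static struct lock_class_key child_queue_key;

	static void init_child_queue(struct my_queue *q)
	{
		spin_lock_init(&q->lock);
		lockdep_set_class(&q->lock, &child_queue_key);
	}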
Signed-off-by: Ingo Molnar Tested-by: Chandra Seetharaman Signed-off-by: Linus Torvalds --- mm/slab.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 5a57cda7490..0f20843beff 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3119,16 +3119,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, if (slabp->inuse == 0) { if (l3->free_objects > l3->free_limit) { l3->free_objects -= cachep->num; - /* - * It is safe to drop the lock. The slab is - * no longer linked to the cache. cachep - * cannot disappear - we are using it and - * all destruction of caches must be - * serialized properly by the user. - */ - spin_unlock(&l3->list_lock); slab_destroy(cachep, slabp); - spin_lock(&l3->list_lock); } else { list_add(&slabp->list, &l3->slabs_free); } -- cgit v1.2.3 From 8757d5fa6b75e8ea906baf0309d49b980e7f9bc9 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 14 Jul 2006 00:23:56 -0700 Subject: [PATCH] mm: fix oom roll-back of __vmalloc_area_node __vunmap must not rely on area->nr_pages when picking the release methode for area->pages. It may be too small when __vmalloc_area_node failed early due to lacking memory. Instead, use a flag in vmstruct to differentiate. Signed-off-by: Jan Kiszka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7b450798b45..266162d2ba2 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -340,7 +340,7 @@ void __vunmap(void *addr, int deallocate_pages) __free_page(area->pages[i]); } - if (area->nr_pages > PAGE_SIZE/sizeof(struct page *)) + if (area->flags & VM_VPAGES) vfree(area->pages); else kfree(area->pages); @@ -427,9 +427,10 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ - if (array_size > PAGE_SIZE) + if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, gfp_mask, PAGE_KERNEL, node); - else + area->flags |= VM_VPAGES; + } else pages = kmalloc_node(array_size, (gfp_mask & ~__GFP_HIGHMEM), node); area->pages = pages; if (!area->pages) { -- cgit v1.2.3 From c38c8db7225465c8d124f38b24d3024decc26bbd Mon Sep 17 00:00:00 2001 From: Anil Keshavamurthy Date: Fri, 14 Jul 2006 00:23:57 -0700 Subject: [PATCH] ia64: race flushing icache in COW path There is a race condition that showed up in a threaded JIT environment. The situation is that a process with a JIT code page forks, so the page is marked read-only, then some threads are created in the child. One of the threads attempts to add a new code block to the JIT page, so a copy-on-write fault is taken, and the kernel allocates a new page, copies the data, installs the new pte, and then calls lazy_mmu_prot_update() to flush caches to make sure that the icache and dcache are in sync. Unfortunately, the other thread runs right after the new pte is installed, but before the caches have been flushed. It tries to execute some old JIT code that was already in this page, but it sees some garbage in the i-cache from the previous users of the new physical page. Fix: we must make the caches consistent before installing the pte. This is an ia64 only fix because lazy_mmu_prot_update() is a no-op on all other architectures. 
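In code terms the fix is a pure ordering change in the do_wp_page() COW path, condensed here from the hunk that follows: the icache/dcache synchronization must happen before the new pte becomes visible, otherwise another thread can execute stale instructions from the freshly allocated page.

	flush_cache_page(vma, address, pte_pfn(orig_pte));
	entry = mk_pte(new_page, vma->vm_page_prot);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	lazy_mmu_prot_update(entry);			/* sync caches for new_page first */
	ptep_establish(vma, address, page_table, entry);	/* only now publish the pte */
	update_mmu_cache(vma, address, entry);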
Signed-off-by: Anil Keshavamurthy Signed-off-by: Tony Luck Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index dc0d82cf2a1..de8bc85dc8f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1549,9 +1549,9 @@ gotten: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + lazy_mmu_prot_update(entry); ptep_establish(vma, address, page_table, entry); update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); lru_cache_add_active(new_page); page_add_new_anon_rmap(new_page, vma, address); -- cgit v1.2.3 From 22c4af4092fc2e037ce2e2922023fc222cf0c443 Mon Sep 17 00:00:00 2001 From: Luke Yang Date: Fri, 14 Jul 2006 00:24:09 -0700 Subject: [PATCH] nommu: export two symbols for drivers to use nommu.c needs to export two more symbols for drivers to use: remap_pfn_range and unmap_mapping_range. Signed-off-by: Luke Yang Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 5151c44a825..c576df71e3b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1070,6 +1070,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; return 0; } +EXPORT_SYMBOL(remap_pfn_range); void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { @@ -1090,6 +1091,7 @@ void unmap_mapping_range(struct address_space *mapping, int even_cows) { } +EXPORT_SYMBOL(unmap_mapping_range); /* * Check that a process has enough memory to allocate a new virtual -- cgit v1.2.3 From 0ff922452df86f3e9a2c6f705c4588ec62d096a7 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:37 -0700 Subject: [PATCH] per-task-delay-accounting: sync block I/O and swapin delay collection Unlike earlier iterations of the delay accounting patches, now delays are only collected for the actual I/O waits rather than try and cover the delays seen in I/O submission paths. Account separately for block I/O delays incurred as a result of swapin page faults whose frequency can be affected by the task/process' rss limit. Hence swapin delays can act as feedback for rss limit changes independent of I/O priority changes. 
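The mm side is simple bracketing, sketched here from the do_swap_page() hunk that follows: the swapin wait is wrapped in a per-task flag so the accounting code can attribute the block I/O delay to swapin rather than to ordinary I/O.

	delayacct_set_flag(DELAYACCT_PF_SWAPIN);	/* about to block on a swapin          */
	page = lookup_swap_cache(entry);
	if (!page)
		swapin_readahead(entry, address, vma);
	/* ... page is read in; error paths clear the flag too ... */
	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);	/* later delays count as ordinary I/O  */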
Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index de8bc85dc8f..109e9866237 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -1934,6 +1935,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, migration_entry_wait(mm, pmd, address); goto out; } + delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); @@ -1946,6 +1948,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) ret = VM_FAULT_OOM; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; } @@ -1955,6 +1958,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, grab_swap_token(); } + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); mark_page_accessed(page); lock_page(page); -- cgit v1.2.3 From b83a8e64fd1aecf021111d22c062c97a3241d0c4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 29 Jul 2006 21:42:55 +0200 Subject: [PATCH] MM: Remove rogue readahead printk For some reason it triggers always with NFS root and spams the kernel logs of my nfs root boxes a lot. Signed-off-by: Andi Kleen Acked-by: Trond Myklebust Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index d087fc3d328..b9a60c43b61 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -849,8 +849,6 @@ static void shrink_readahead_size_eio(struct file *filp, return; ra->ra_pages /= 4; - printk(KERN_WARNING "Reducing readahead size to %luK\n", - ra->ra_pages << (PAGE_CACHE_SHIFT - 10)); } /** -- cgit v1.2.3 From 8c78f3075dab4be279e283f901f00e33ce44890a Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Sun, 30 Jul 2006 03:03:35 -0700 Subject: [PATCH] cpu hotplug: replace __devinit* with __cpuinit* for cpu notifications Few of the callback functions and notifier blocks that are associated with cpu notifications incorrectly have __devinit and __devinitdata. They should be __cpuinit and __cpuinitdata instead. It makes no functional difference but wastes text area when CONFIG_HOTPLUG is enabled and CONFIG_HOTPLUG_CPU is not. This patch fixes all those instances. Signed-off-by: Chandra Seetharaman Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 0f20843beff..252972ff649 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1106,7 +1106,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) #endif -static int __devinit cpuup_callback(struct notifier_block *nfb, +static int __cpuinit cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; -- cgit v1.2.3 From b8008b2bc21fb13b45964e21247f18c013d6e985 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Sun, 30 Jul 2006 03:04:04 -0700 Subject: [PATCH] Fix kmem_cache_alloc() been documented twice kmem_cache_alloc() was documented twice, but kmem_cache_zalloc() never. Fix this obvious typo to get things right. 
Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 252972ff649..21ba0603570 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3224,7 +3224,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) EXPORT_SYMBOL(kmem_cache_alloc); /** - * kmem_cache_alloc - Allocate an object. The memory is set to zero. + * kmem_cache_zalloc - Allocate an object. The memory is set to zero. * @cache: The cache to allocate from. * @flags: See kmalloc(). * -- cgit v1.2.3 From 60c371bc753495f36d3a71338b46030f7fffce3b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 5 Aug 2006 12:14:25 -0700 Subject: [PATCH] fadvise() make POSIX_FADV_NOREUSE a no-op The POSIX_FADV_NOREUSE hint means "the application will use this range of the file a single time". It seems to be intended that the implementation will use this hint to perform drop-behind of that part of the file when the application gets around to reading or writing it. However for reasons which aren't obvious (or sane?) I mapped POSIX_FADV_NOREUSE onto POSIX_FADV_WILLNEED. ie: it does readahead. That's daft. So for now, make POSIX_FADV_NOREUSE a no-op. This is a non-back-compatible change. If someone was using POSIX_FADV_NOREUSE to perform readahead, they lose. The likelihood is low. If/when we later implement POSIX_FADV_NOREUSE things will get interesting - to do it fully we'll need to maintain file offset/length ranges and peform all sorts of complex tricks, and managing the lifetime of those ranges' data structures will be interesting.. A sensible implementation would probably ignore the file range and would simply mark the entire file as needing some form of drop-behind treatment. Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index 60a5d55e51d..168c78a121b 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -73,7 +73,6 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) file->f_ra.ra_pages = bdi->ra_pages * 2; break; case POSIX_FADV_WILLNEED: - case POSIX_FADV_NOREUSE: if (!mapping->a_ops->readpage) { ret = -EINVAL; break; @@ -94,6 +93,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) if (ret > 0) ret = 0; break; + case POSIX_FADV_NOREUSE: + break; case POSIX_FADV_DONTNEED: if (!bdi_write_congested(mapping->backing_dev_info)) filemap_flush(mapping); -- cgit v1.2.3 From 6f712711dbd180aa3777efe5ae3b9b0e915b9471 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Sat, 5 Aug 2006 12:14:58 -0700 Subject: [PATCH] memory hotadd fixes: not-aligned memory hotadd handling fix ioresouce handling code in memory hotplug allows not-aligned memory hot add. But when memmap and other memory structures are initialized, parameters should be aligned. (if not aligned, initialization of mem_map will do wrong, it assumes parameters are aligned.) This patch fix it. And this patch allows ioresource collision check to handle -EEXIST. 
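The alignment part is plain arithmetic: round the hot-added pfn range outward to whole sparsemem sections and add every section it touches, treating -EEXIST from already-present sections as benign. The kernel code below does this with pfn_to_section_nr() and PFN_SECTION_SHIFT; the same rounding in standalone form, with a made-up section size:

	#include <stdio.h>

	#define PAGES_PER_SECTION (1UL << 14)	/* illustrative only: 64 MB sections, 4 KB pages */

	int main(void)
	{
		unsigned long start_pfn = 100000;	/* deliberately not section-aligned */
		unsigned long nr_pages  = 50000;
		unsigned long start_sec = start_pfn / PAGES_PER_SECTION;
		unsigned long end_sec   = (start_pfn + nr_pages - 1) / PAGES_PER_SECTION;
		unsigned long sec;

		for (sec = start_sec; sec <= end_sec; sec++)
			printf("add section %lu: pfn %lu .. %lu\n", sec,
			       sec * PAGES_PER_SECTION, (sec + 1) * PAGES_PER_SECTION - 1);
		return 0;
	}

Every pfn in [start_pfn, start_pfn + nr_pages) falls inside one of the printed sections, which is exactly the property the mem_map initialization relies on.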
Signed-off-by: KAMEZAWA Hiroyuki Cc: Keith Mannthey Cc: Yasunori Goto Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 01c9fb97c61..7d25cc12235 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -76,15 +76,22 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn, { unsigned long i; int err = 0; + int start_sec, end_sec; + /* during initialize mem_map, align hot-added range to section */ + start_sec = pfn_to_section_nr(phys_start_pfn); + end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); - for (i = 0; i < nr_pages; i += PAGES_PER_SECTION) { - err = __add_section(zone, phys_start_pfn + i); + for (i = start_sec; i <= end_sec; i++) { + err = __add_section(zone, i << PFN_SECTION_SHIFT); - /* We want to keep adding the rest of the - * sections if the first ones already exist + /* + * EEXIST is finally dealed with by ioresource collision + * check. see add_memory() => register_memory_resource() + * Warning will be printed if there is collision. */ if (err && (err != -EEXIST)) break; + err = 0; } return err; @@ -213,10 +220,10 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } /* add this memory to iomem resource */ -static void register_memory_resource(u64 start, u64 size) +static int register_memory_resource(u64 start, u64 size) { struct resource *res; - + int ret = 0; res = kzalloc(sizeof(struct resource), GFP_KERNEL); BUG_ON(!res); @@ -228,7 +235,9 @@ static void register_memory_resource(u64 start, u64 size) printk("System RAM resource %llx - %llx cannot be added\n", (unsigned long long)res->start, (unsigned long long)res->end); kfree(res); + ret = -EEXIST; } + return ret; } @@ -269,7 +278,7 @@ int add_memory(int nid, u64 start, u64 size) } /* register this memory as resource */ - register_memory_resource(start, size); + ret = register_memory_resource(start, size); return ret; error: -- cgit v1.2.3 From 58c1b5b079071d82b2f924000b7e8fb5585ce7d8 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Sat, 5 Aug 2006 12:15:01 -0700 Subject: [PATCH] memory hotadd fixes: find_next_system_ram catch range fix find_next_system_ram() is used to find available memory resource at onlining newly added memory. This patch fixes following problem. find_next_system_ram() cannot catch this case. 
Resource: (start)-------------(end) Section : (start)-------------(end) Signed-off-by: KAMEZAWA Hiroyuki Cc: Keith Mannthey Cc: Yasunori Goto Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7d25cc12235..26f1840879d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -163,7 +163,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) res.flags = IORESOURCE_MEM; /* we just need system ram */ section_end = res.end; - while (find_next_system_ram(&res) >= 0) { + while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); nr_pages = (unsigned long) ((res.end + 1 - res.start) >> PAGE_SHIFT); -- cgit v1.2.3 From ebd15302dc0ba1b8761600c20854f5371e7bae1e Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Sat, 5 Aug 2006 12:15:06 -0700 Subject: [PATCH] memory hotadd fixes: enhance collision check This patch is for collision check enhancement for memory hot add. It's better to do resouce collision check before doing memory hot add, which will touch memory management structures. And add_section() should check section exists or not before calling sparse_add_one_section(). (sparse_add_one_section() will do another check anyway. but checking in memory_hotplug.c will be easy to understand.) Signed-off-by: KAMEZAWA Hiroyuki Cc: keith mannthey Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 26f1840879d..c37319542b7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -52,6 +52,9 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn) int nr_pages = PAGES_PER_SECTION; int ret; + if (pfn_valid(phys_start_pfn)) + return -EEXIST; + ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); if (ret < 0) @@ -220,10 +223,9 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } /* add this memory to iomem resource */ -static int register_memory_resource(u64 start, u64 size) +static struct resource *register_memory_resource(u64 start, u64 size) { struct resource *res; - int ret = 0; res = kzalloc(sizeof(struct resource), GFP_KERNEL); BUG_ON(!res); @@ -235,9 +237,18 @@ static int register_memory_resource(u64 start, u64 size) printk("System RAM resource %llx - %llx cannot be added\n", (unsigned long long)res->start, (unsigned long long)res->end); kfree(res); - ret = -EEXIST; + res = NULL; } - return ret; + return res; +} + +static void release_memory_resource(struct resource *res) +{ + if (!res) + return; + release_resource(res); + kfree(res); + return; } @@ -246,8 +257,13 @@ int add_memory(int nid, u64 start, u64 size) { pg_data_t *pgdat = NULL; int new_pgdat = 0; + struct resource *res; int ret; + res = register_memory_resource(start, size); + if (!res) + return -EEXIST; + if (!node_online(nid)) { pgdat = hotadd_new_pgdat(nid, start); if (!pgdat) @@ -277,14 +293,13 @@ int add_memory(int nid, u64 start, u64 size) BUG_ON(ret); } - /* register this memory as resource */ - ret = register_memory_resource(start, size); - return ret; error: /* rollback pgdat allocation and others */ if (new_pgdat) rollback_node_hotadd(nid, pgdat); + if (res) + release_memory_resource(res); return ret; } -- cgit v1.2.3 From 
1d7ea7324ae7a59f8e17e4ba76a2707c1e6f24d2 Mon Sep 17 00:00:00 2001 From: Alexander Zarochentsev Date: Sun, 13 Aug 2006 23:24:27 -0700 Subject: [PATCH] fuse: fix error case in fuse_readpages Don't let fuse_readpages leave the @pages list not empty when exiting on error. [akpm@osdl.org: kernel-doc fixes] Signed-off-by: Alexander Zarochentsev Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/swap.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 8fd095c4ae5..687686a61f7 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -54,6 +54,26 @@ void put_page(struct page *page) } EXPORT_SYMBOL(put_page); +/** + * put_pages_list(): release a list of pages + * + * Release a list of pages which are strung together on page.lru. Currently + * used by read_cache_pages() and related error recovery code. + * + * @pages: list of pages threaded on page->lru + */ +void put_pages_list(struct list_head *pages) +{ + while (!list_empty(pages)) { + struct page *victim; + + victim = list_entry(pages->prev, struct page, lru); + list_del(&victim->lru); + page_cache_release(victim); + } +} +EXPORT_SYMBOL(put_pages_list); + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the -- cgit v1.2.3 From b6b5bce3571e496504a89ee575d32101e0a98b93 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 27 Aug 2006 01:23:25 -0700 Subject: [PATCH] swsusp: Fix swap_type_of There is a bug in mm/swapfile.c#swap_type_of() that makes swsusp only be able to use the first active swap partition as the resume device. Fix it. Signed-off-by: Rafael J. Wysocki Cc: Hugh Dickins Acked-by: Pavel Machek Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index e70d6c6d6fe..f1f5ec78378 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -442,11 +442,12 @@ int swap_type_of(dev_t device) if (!(swap_info[i].flags & SWP_WRITEOK)) continue; + if (!device) { spin_unlock(&swap_lock); return i; } - inode = swap_info->swap_file->f_dentry->d_inode; + inode = swap_info[i].swap_file->f_dentry->d_inode; if (S_ISBLK(inode->i_mode) && device == MKDEV(imajor(inode), iminor(inode))) { spin_unlock(&swap_lock); -- cgit v1.2.3 From a302eb4e4602d6444ae75a0e516fb2f2c62d6642 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 31 Aug 2006 21:27:34 -0700 Subject: [PATCH] ZVC: Overstep counters Increments and decrements are usually grouped rather than mixed. We can optimize the inc and dec functions for that case. Increment and decrement the counters by 50% more than the threshold in those cases and set the differential accordingly. This decreases the need to update the atomic counters. The idea came originally from Andrew Morton. The overstepping alone was sufficient to address the contention issue found when updating the global and the per zone counters from 160 processors. Also remove some code in dec_zone_page_state. 
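The effect of the overstep is easiest to see with concrete numbers. Below is a userspace simulation with stand-in variables (not kernel code), using the old fixed threshold of 32: because every flush leaves the per-cpu differential at -threshold/2 instead of 0, a monotonic stream of increments reaches the flush point roughly every 49 events instead of every 33, while global + differential always equals the true count.

	#include <stdio.h>

	int main(void)
	{
		const int threshold = 32;	/* stands in for the per-zone threshold      */
		long global = 0;		/* stands in for the atomic VM counter       */
		int diff = 0;			/* stands in for the per-cpu s8 differential */
		int flushes = 0;
		int i;

		for (i = 0; i < 1000; i++) {
			diff++;
			if (diff > threshold) {
				global += diff + threshold / 2;	/* overstep by 50% */
				diff = -threshold / 2;
				flushes++;
			}
		}
		printf("events=1000 tracked=%ld flushes=%d\n", global + diff, flushes);
		return 0;
	}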
Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index dfdf2413390..3799a0f7543 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -190,8 +190,8 @@ static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) (*p)++; if (unlikely(*p > STAT_THRESHOLD)) { - zone_page_state_add(*p, zone, item); - *p = 0; + zone_page_state_add(*p + STAT_THRESHOLD / 2, zone, item); + *p = -STAT_THRESHOLD / 2; } } @@ -209,8 +209,8 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) (*p)--; if (unlikely(*p < -STAT_THRESHOLD)) { - zone_page_state_add(*p, zone, item); - *p = 0; + zone_page_state_add(*p - STAT_THRESHOLD / 2, zone, item); + *p = STAT_THRESHOLD /2; } } EXPORT_SYMBOL(__dec_zone_page_state); @@ -239,19 +239,9 @@ EXPORT_SYMBOL(inc_zone_page_state); void dec_zone_page_state(struct page *page, enum zone_stat_item item) { unsigned long flags; - struct zone *zone; - s8 *p; - zone = page_zone(page); local_irq_save(flags); - p = diff_pointer(zone, item); - - (*p)--; - - if (unlikely(*p < -STAT_THRESHOLD)) { - zone_page_state_add(*p, zone, item); - *p = 0; - } + __dec_zone_page_state(page, item); local_irq_restore(flags); } EXPORT_SYMBOL(dec_zone_page_state); -- cgit v1.2.3 From df9ecaba3f152d1ea79f2a5e0b87505e03f47590 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 31 Aug 2006 21:27:35 -0700 Subject: [PATCH] ZVC: Scale thresholds depending on the size of the system The ZVC counter update threshold is currently set to a fixed value of 32. This patch sets up the threshold depending on the number of processors and the sizes of the zones in the system. With the current threshold of 32, I was able to observe slight contention when more than 130-140 processors concurrently updated the counters. The contention vanished when I either increased the threshold to 64 or used Andrew's idea of overstepping the interval (see ZVC overstep patch). However, we saw contention again at 220-230 processors. So we need higher values for larger systems. But the current default is already a bit of an overkill for smaller systems. Some systems have tiny zones where precision matters. For example i386 and x86_64 have 16M DMA zones and either 900M ZONE_NORMAL or ZONE_DMA32. These are even present on SMP and NUMA systems. The patch here sets up a threshold based on the number of processors in the system and the size of the zone that these counters are used for. The threshold should grow logarithmically, so we use fls() as an easy approximation. Results of tests on a system with 1024 processors (4TB RAM) The following output is from a test allocating 1GB of memory concurrently on each processor (Forking the process. So contention on mmap_sem and the pte locks is not a factor): X MIN TYPE: CPUS WALL WALL SYS USER TOTCPU fork 1 0.552 0.552 0.540 0.012 0.552 fork 4 0.552 0.548 2.164 0.036 2.200 fork 16 0.564 0.548 8.812 0.164 8.976 fork 128 0.580 0.572 72.204 1.208 73.412 fork 256 1.300 0.660 310.400 2.160 312.560 fork 512 3.512 0.696 1526.836 4.816 1531.652 fork 1020 20.024 0.700 17243.176 6.688 17249.863 So a threshold of 32 is fine up to 128 processors. At 256 processors contention becomes a factor. 
Overstepping the counter (earlier patch) improves the numbers a bit: fork 4 0.552 0.548 2.164 0.040 2.204 fork 16 0.552 0.548 8.640 0.148 8.788 fork 128 0.556 0.548 69.676 0.956 70.632 fork 256 0.876 0.636 212.468 2.108 214.576 fork 512 2.276 0.672 997.324 4.260 1001.584 fork 1020 13.564 0.680 11586.436 6.088 11592.523 Still contention at 512 and 1020. Contention at 1020 is down by a third. 256 still has a slight bit of contention. After this patch the counter threshold will be set to 125 which reduces contention significantly: fork 128 0.560 0.548 69.776 0.932 70.708 fork 256 0.636 0.556 143.460 2.036 145.496 fork 512 0.640 0.548 284.244 4.236 288.480 fork 1020 1.500 0.588 1326.152 8.892 1335.044 [akpm@osdl.org: !SMP build fix] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 119 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 3799a0f7543..c1b5f4106b3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -12,6 +12,7 @@ #include #include #include +#include void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat) @@ -114,17 +115,72 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP -#define STAT_THRESHOLD 32 +static int calculate_threshold(struct zone *zone) +{ + int threshold; + int mem; /* memory in 128 MB units */ + + /* + * The threshold scales with the number of processors and the amount + * of memory per zone. More memory means that we can defer updates for + * longer, more processors could lead to more contention. + * fls() is used to have a cheap way of logarithmic scaling. + * + * Some sample thresholds: + * + * Threshold Processors (fls) Zonesize fls(mem+1) + * ------------------------------------------------------------------ + * 8 1 1 0.9-1 GB 4 + * 16 2 2 0.9-1 GB 4 + * 20 2 2 1-2 GB 5 + * 24 2 2 2-4 GB 6 + * 28 2 2 4-8 GB 7 + * 32 2 2 8-16 GB 8 + * 4 2 2 <128M 1 + * 30 4 3 2-4 GB 5 + * 48 4 3 8-16 GB 8 + * 32 8 4 1-2 GB 4 + * 32 8 4 0.9-1GB 4 + * 10 16 5 <128M 1 + * 40 16 5 900M 4 + * 70 64 7 2-4 GB 5 + * 84 64 7 4-8 GB 6 + * 108 512 9 4-8 GB 6 + * 125 1024 10 8-16 GB 8 + * 125 1024 10 16-32 GB 9 + */ + + mem = zone->present_pages >> (27 - PAGE_SHIFT); + + threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem)); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} /* - * Determine pointer to currently valid differential byte given a zone and - * the item number. - * - * Preemption must be off + * Refresh the thresholds for each zone. 
*/ -static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) +static void refresh_zone_stat_thresholds(void) { - return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item]; + struct zone *zone; + int cpu; + int threshold; + + for_each_zone(zone) { + + if (!zone->present_pages) + continue; + + threshold = calculate_threshold(zone); + + for_each_online_cpu(cpu) + zone_pcp(zone, cpu)->stat_threshold = threshold; + } } /* @@ -133,17 +189,16 @@ static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - s8 *p; + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; long x; - p = diff_pointer(zone, item); x = delta + *p; - if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) { + if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { zone_page_state_add(x, zone, item); x = 0; } - *p = x; } EXPORT_SYMBOL(__mod_zone_page_state); @@ -172,10 +227,12 @@ EXPORT_SYMBOL(mod_zone_page_state); * No overflow check is necessary and therefore the differential can be * incremented or decremented in place which may allow the compilers to * generate better code. - * * The increment or decrement is known and therefore one boundary check can * be omitted. * + * NOTE: These functions are very performance sensitive. Change only + * with care. + * * Some processors have inc/dec instructions that are atomic vs an interrupt. * However, the code must first determine the differential location in a zone * based on the processor number and then inc/dec the counter. There is no @@ -185,13 +242,16 @@ EXPORT_SYMBOL(mod_zone_page_state); */ static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - s8 *p = diff_pointer(zone, item); + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; (*p)++; - if (unlikely(*p > STAT_THRESHOLD)) { - zone_page_state_add(*p + STAT_THRESHOLD / 2, zone, item); - *p = -STAT_THRESHOLD / 2; + if (unlikely(*p > pcp->stat_threshold)) { + int overstep = pcp->stat_threshold / 2; + + zone_page_state_add(*p + overstep, zone, item); + *p = -overstep; } } @@ -204,13 +264,16 @@ EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { struct zone *zone = page_zone(page); - s8 *p = diff_pointer(zone, item); + struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); + s8 *p = pcp->vm_stat_diff + item; (*p)--; - if (unlikely(*p < -STAT_THRESHOLD)) { - zone_page_state_add(*p - STAT_THRESHOLD / 2, zone, item); - *p = STAT_THRESHOLD /2; + if (unlikely(*p < - pcp->stat_threshold)) { + int overstep = pcp->stat_threshold / 2; + + zone_page_state_add(*p - overstep, zone, item); + *p = overstep; } } EXPORT_SYMBOL(__dec_zone_page_state); @@ -515,6 +578,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) pageset->pcp[j].high, pageset->pcp[j].batch); } +#ifdef CONFIG_SMP + seq_printf(m, "\n vm stats threshold: %d", + pageset->stat_threshold); +#endif } seq_printf(m, "\n all_unreclaimable: %u" @@ -603,3 +670,35 @@ struct seq_operations vmstat_op = { #endif /* CONFIG_PROC_FS */ +#ifdef CONFIG_SMP +/* + * Use the cpu notifier to insure that the thresholds are recalculated + * when necessary. 
+ */ +static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_CANCELED: + case CPU_DEAD: + refresh_zone_stat_thresholds(); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata vmstat_notifier = + { &vmstat_cpuup_callback, NULL, 0 }; + +int __init setup_vmstat(void) +{ + refresh_zone_stat_thresholds(); + register_cpu_notifier(&vmstat_notifier); + return 0; +} +module_init(setup_vmstat) +#endif -- cgit v1.2.3 From 0b1d647a02c5a1b67d45287eeb6cb3b2219c41c3 Mon Sep 17 00:00:00 2001 From: Pavel Mironchik Date: Thu, 31 Aug 2006 21:27:47 -0700 Subject: [PATCH] dm: work around mempool_alloc, bio_alloc_bioset deadlocks This patch works around a complex dm-related deadlock/livelock down in the mempool allocator. Alasdair said: Several dm targets suffer from this. Mempools are not yet used correctly everywhere in device-mapper: they can get shared when devices are stacked, and some targets share them across multiple instances. I made fixing this one of the prerequisites for this patch: md-dm-reduce-stack-usage-with-stacked-block-devices.patch which in some cases makes people more likely to hit the problem. There's been some progress on this recently with (unfinished) dm-crypt patches at: http://www.kernel.org/pub/linux/kernel/people/agk/patches/2.6/editing/ (dm-crypt-move-io-to-workqueue.patch plus dependencies) and: I've no problems with a temporary workaround like that, but Milan Broz (a new Redhat developer in the Czech Republic) has started reviewing all the mempool usage in device-mapper so I'm expecting we'll soon have a proper fix for this associated problems. [He's back from holiday at the start of next week.] For now, this sad-but-safe little patch will allow the machine to recover. [akpm@osdl.org: rewrote changelog] Cc: Alasdair G Kergon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index fe6e05289cc..ccd8cb8cd41 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -238,8 +238,13 @@ repeat_alloc: init_wait(&wait); prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); smp_mb(); - if (!pool->curr_nr) - io_schedule(); + if (!pool->curr_nr) { + /* + * FIXME: this should be io_schedule(). The timeout is there + * as a workaround for some DM problems in 2.6.18. + */ + io_schedule_timeout(5*HZ); + } finish_wait(&pool->wait, &wait); goto repeat_alloc; -- cgit v1.2.3 From 3b98b087fc2daab67518d2baa8aef19a6ad82723 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Thu, 31 Aug 2006 21:27:53 -0700 Subject: [PATCH] fix NUMA interleaving for huge pages Since vma->vm_pgoff is in units of smallpages, VMAs for huge pages have the lower HPAGE_SHIFT - PAGE_SHIFT bits always cleared, which results in badd offsets to the interleave functions. Take this difference from small pages into account when calculating the offset. This does add a 0-bit shift into the small-page path (via alloc_page_vma()), but I think that is negligible. Also add a BUG_ON to prevent the offset from growing due to a negative right-shift, which probably shouldn't be allowed anyways. Tested on an 8-memory node ppc64 NUMA box and got the interleaving I expected. 
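The arithmetic is easy to check by hand. With 4 KiB base pages and 16 MiB huge pages (shift = 24), vm_pgoff counts small pages, so consecutive huge pages in one mapping differ by 4096 in vm_pgoff and the low 12 bits are always zero; using vm_pgoff directly as the interleave offset therefore keeps landing on the same node. Shifting off those always-zero bits restores consecutive indices. Illustrative numbers only, with a hypothetical four-node round-robin and the (addr - vm_start) term from the real code omitted:

	#include <stdio.h>

	int main(void)
	{
		const int page_shift = 12, hpage_shift = 24, nnodes = 4;
		unsigned long hp;

		for (hp = 0; hp < 3; hp++) {	/* three consecutive huge pages */
			unsigned long vm_pgoff = hp << (hpage_shift - page_shift);
			unsigned long bad  = vm_pgoff;					/* pre-patch  */
			unsigned long good = vm_pgoff >> (hpage_shift - page_shift);	/* post-patch */
			printf("hugepage %lu: old off=%lu -> node %lu, new off=%lu -> node %lu\n",
			       hp, bad, bad % nnodes, good, good % nnodes);
		}
		return 0;
	}

With the old offset every huge page lands on node 0 (4096 is a multiple of 4); with the shifted offset they rotate through nodes 0, 1, 2 as intended.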
Signed-off-by: Nishanth Aravamudan Signed-off-by: Adam Litke Cc: Andi Kleen Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e07e27e846a..a9963ceddd6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1176,7 +1176,15 @@ static inline unsigned interleave_nid(struct mempolicy *pol, if (vma) { unsigned long off; - off = vma->vm_pgoff; + /* + * for small pages, there is no difference between + * shift and PAGE_SHIFT, so the bit-shift is safe. + * for huge pages, since vm_pgoff is in units of small + * pages, we need to shift off the always 0 bits to get + * a useful offset. + */ + BUG_ON(shift < PAGE_SHIFT); + off = vma->vm_pgoff >> (shift - PAGE_SHIFT); off += (addr - vma->vm_start) >> shift; return offset_il_node(pol, vma, off); } else -- cgit v1.2.3 From 3a459756810912d2c2bf188cef566af255936b4d Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Thu, 7 Sep 2006 14:17:04 +0400 Subject: [PATCH] IA64,sparc: local DoS with corrupted ELFs This prevents cross-region mappings on IA64 and SPARC which could lead to system crash. They were correctly trapped for normal mmap() calls, but not for the kernel internal calls generated by executable loading. This code just moves the architecture-specific cross-region checks into an arch-specific "arch_mmap_check()" macro, and defines that for the architectures that needed it (ia64, sparc and sparc64). Architectures that don't have any special requirements can just ignore the new cross-region check, since the mmap() code will just notice on its own when the macro isn't defined. Signed-off-by: Pavel Emelianov Signed-off-by: Kirill Korotaev Acked-by: David Miller Signed-off-by: Greg Kroah-Hartman [ Cleaned up to not affect architectures that don't need it ] Signed-off-by: Linus Torvalds --- mm/mmap.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index c1868ecdbc5..e66a0b524af 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -30,6 +30,10 @@ #include #include +#ifndef arch_mmap_check +#define arch_mmap_check(addr, len, flags) (0) +#endif + static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end); @@ -913,6 +917,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, if (!len) return -EINVAL; + error = arch_mmap_check(addr, len, flags); + if (error) + return error; + /* Careful about overflows.. */ len = PAGE_ALIGN(len); if (!len || len > TASK_SIZE) @@ -1859,6 +1867,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) unsigned long flags; struct rb_node ** rb_link, * rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; + int error; len = PAGE_ALIGN(len); if (!len) @@ -1867,6 +1876,12 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if ((addr + len) > TASK_SIZE || (addr + len) < addr) return -EINVAL; + flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + + error = arch_mmap_check(addr, len, flags); + if (error) + return error; + /* * mlock MCL_FUTURE? */ @@ -1907,8 +1922,6 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (security_vm_enough_memory(len >> PAGE_SHIFT)) return -ENOMEM; - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - /* Can we just expand an old private anonymous mapping? 
*/ if (vma_merge(mm, prev, addr, addr + len, flags, NULL, NULL, pgoff, NULL)) -- cgit v1.2.3 From 016eb4a0ed06a3677d67a584da901f0e9a63c666 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 8 Sep 2006 09:48:38 -0700 Subject: [PATCH] invalidate_complete_page() race fix If a CPU faults this page into pagetables after invalidate_mapping_pages() checked page_mapped(), invalidate_complete_page() will still proceed to remove the page from pagecache. This leaves the page-faulting process with a detached page. If it was MAP_SHARED then file data loss will ensue. Fix that up by checking the page's refcount after taking tree_lock. Cc: Nick Piggin Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index cf1b015df4a..c6ab55ec688 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -68,10 +68,10 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) return 0; write_lock_irq(&mapping->tree_lock); - if (PageDirty(page)) { - write_unlock_irq(&mapping->tree_lock); - return 0; - } + if (PageDirty(page)) + goto failed; + if (page_count(page) != 2) /* caller's ref + pagecache ref */ + goto failed; BUG_ON(PagePrivate(page)); __remove_from_page_cache(page); @@ -79,6 +79,9 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; +failed: + write_unlock_irq(&mapping->tree_lock); + return 0; } /** -- cgit v1.2.3
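The pattern in this last fix is worth spelling out: a lockless precondition check (page_mapped() in invalidate_mapping_pages()) must be revalidated under the lock that serializes the state change, here by insisting on exactly two references, the caller's and the page cache's, after tree_lock is taken. A userspace sketch of the same discipline with a plain mutex and an ordinary counter (names are illustrative, not kernel API):

	#include <pthread.h>
	#include <stdio.h>

	struct cached_page {
		pthread_mutex_t lock;	/* plays the role of mapping->tree_lock */
		int refcount;		/* 2 == caller's ref + cache's ref      */
		int in_cache;
	};

	/* Return 1 if the page was dropped from the cache, 0 if another user
	 * raced in after whatever lockless checks the caller did earlier. */
	static int try_invalidate(struct cached_page *p)
	{
		int ret = 0;

		pthread_mutex_lock(&p->lock);
		if (p->refcount == 2) {		/* revalidate under the lock  */
			p->in_cache = 0;
			p->refcount--;		/* drop the cache's reference */
			ret = 1;
		}
		pthread_mutex_unlock(&p->lock);
		return ret;
	}

	int main(void)
	{
		struct cached_page page = { PTHREAD_MUTEX_INITIALIZER, 3, 1 };

		printf("extra ref held:    removed=%d\n", try_invalidate(&page));	/* 0 */
		page.refcount = 2;
		printf("cache+caller only: removed=%d\n", try_invalidate(&page));	/* 1 */
		return 0;
	}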