diff options
author | James Bottomley <jejb@mulgrave.il.steeleye.com> | 2006-03-14 14:18:01 -0600 |
---|---|---|
committer | James Bottomley <jejb@mulgrave.il.steeleye.com> | 2006-03-14 14:18:01 -0600 |
commit | f33b5d783b4f56be5ace6a1c98fb5f76b2d2d07d (patch) | |
tree | b027b5f3429d416b3da5b9195024007dab062a5e /mm | |
parent | e935d5da8e5d12fabe5b632736c50eae0427e8c8 (diff) | |
parent | 67963132638e67ad3c5aa16765e6f3f2f3cdd85c (diff) |
Merge ../linux-2.6
Diffstat (limited to 'mm')
-rw-r--r-- | mm/memory_hotplug.c | 1 | ||||
-rw-r--r-- | mm/mempolicy.c | 132 | ||||
-rw-r--r-- | mm/nommu.c | 8 | ||||
-rw-r--r-- | mm/oom_kill.c | 5 | ||||
-rw-r--r-- | mm/page_alloc.c | 17 | ||||
-rw-r--r-- | mm/rmap.c | 21 | ||||
-rw-r--r-- | mm/slab.c | 122 | ||||
-rw-r--r-- | mm/swap.c | 25 | ||||
-rw-r--r-- | mm/vmscan.c | 3 |
9 files changed, 255 insertions, 79 deletions
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a918f77f02f..1fe76d963ac 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -130,6 +130,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) onlined_pages++; } zone->present_pages += onlined_pages; + zone->zone_pgdat->node_present_pages += onlined_pages; setup_per_zone_pages_min(); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 67af4cea1e2..954981b1430 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -197,7 +197,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) return policy; } -static void gather_stats(struct page *, void *); +static void gather_stats(struct page *, void *, int pte_dirty); static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); @@ -239,7 +239,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; if (flags & MPOL_MF_STATS) - gather_stats(page, private); + gather_stats(page, private, pte_dirty(*pte)); else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) migrate_page_add(page, private, flags); else @@ -954,7 +954,8 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, goto out; } - err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE); + err = do_migrate_pages(mm, &old, &new, + capable(CAP_SYS_ADMIN) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); out: mmput(mm); return err; @@ -1752,66 +1753,145 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) struct numa_maps { unsigned long pages; unsigned long anon; - unsigned long mapped; + unsigned long active; + unsigned long writeback; unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; unsigned long node[MAX_NUMNODES]; }; -static void gather_stats(struct page *page, void *private) +static void gather_stats(struct page *page, void *private, int pte_dirty) { struct numa_maps *md = private; int count = page_mapcount(page); - if (count) - md->mapped++; + md->pages++; + if (pte_dirty || PageDirty(page)) + md->dirty++; - if (count > md->mapcount_max) - md->mapcount_max = count; + if (PageSwapCache(page)) + md->swapcache++; - md->pages++; + if (PageActive(page)) + md->active++; + + if (PageWriteback(page)) + md->writeback++; if (PageAnon(page)) md->anon++; + if (count > md->mapcount_max) + md->mapcount_max = count; + md->node[page_to_nid(page)]++; cond_resched(); } +#ifdef CONFIG_HUGETLB_PAGE +static void check_huge_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct numa_maps *md) +{ + unsigned long addr; + struct page *page; + + for (addr = start; addr < end; addr += HPAGE_SIZE) { + pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); + pte_t pte; + + if (!ptep) + continue; + + pte = *ptep; + if (pte_none(pte)) + continue; + + page = pte_page(pte); + if (!page) + continue; + + gather_stats(page, md, pte_dirty(*ptep)); + } +} +#else +static inline void check_huge_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct numa_maps *md) +{ +} +#endif + int show_numa_map(struct seq_file *m, void *v) { struct task_struct *task = m->private; struct vm_area_struct *vma = v; struct numa_maps *md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; int n; char buffer[50]; - if (!vma->vm_mm) + if (!mm) return 0; md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); if (!md) return 0; - check_pgd_range(vma, vma->vm_start, vma->vm_end, - &node_online_map, MPOL_MF_STATS, md); + mpol_to_str(buffer, sizeof(buffer), + get_vma_policy(task, vma, vma->vm_start)); - if (md->pages) { - mpol_to_str(buffer, sizeof(buffer), - get_vma_policy(task, vma, vma->vm_start)); + seq_printf(m, "%08lx %s", vma->vm_start, buffer); - seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu", - vma->vm_start, buffer, md->pages, - md->mapped, md->mapcount_max); + if (file) { + seq_printf(m, " file="); + seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + seq_printf(m, " stack"); + } - if (md->anon) - seq_printf(m," anon=%lu",md->anon); + if (is_vm_hugetlb_page(vma)) { + check_huge_range(vma, vma->vm_start, vma->vm_end, md); + seq_printf(m, " huge"); + } else { + check_pgd_range(vma, vma->vm_start, vma->vm_end, + &node_online_map, MPOL_MF_STATS, md); + } - for_each_online_node(n) - if (md->node[n]) - seq_printf(m, " N%d=%lu", n, md->node[n]); + if (!md->pages) + goto out; - seq_putc(m, '\n'); - } + if (md->anon) + seq_printf(m," anon=%lu",md->anon); + + if (md->dirty) + seq_printf(m," dirty=%lu",md->dirty); + + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m," swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m," active=%lu", md->active); + + if (md->writeback) + seq_printf(m," writeback=%lu", md->writeback); + + for_each_online_node(n) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); +out: + seq_putc(m, '\n'); kfree(md); if (m->count < m->size) diff --git a/mm/nommu.c b/mm/nommu.c index 99d21020ec9..4951f4786f2 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -53,7 +53,6 @@ DECLARE_RWSEM(nommu_vma_sem); struct vm_operations_struct generic_file_vm_ops = { }; -EXPORT_SYMBOL(vmalloc); EXPORT_SYMBOL(vfree); EXPORT_SYMBOL(vmalloc_to_page); EXPORT_SYMBOL(vmalloc_32); @@ -205,6 +204,13 @@ void *vmalloc(unsigned long size) { return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); } +EXPORT_SYMBOL(vmalloc); + +void *vmalloc_node(unsigned long size, int node) +{ + return vmalloc(size); +} +EXPORT_SYMBOL(vmalloc_node); /* * vmalloc_32 - allocate virtually continguos memory (32bit addressable) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8123fad5a48..78747afad6b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -302,7 +302,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { struct mm_struct *mm = NULL; task_t *p; - unsigned long points; + unsigned long points = 0; if (printk_ratelimit()) { printk("oom-killer: gfp_mask=0x%x, order=%d\n", @@ -355,6 +355,7 @@ retry: } out: + read_unlock(&tasklist_lock); cpuset_unlock(); if (mm) mmput(mm); @@ -364,5 +365,5 @@ out: * retry to allocate memory unless "p" is current */ if (!test_thread_flag(TIF_MEMDIE)) - schedule_timeout_interruptible(1); + schedule_timeout_uninterruptible(1); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 791690d7d3f..234bd4895d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -590,21 +590,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, } #ifdef CONFIG_NUMA -/* Called from the slab reaper to drain remote pagesets */ -void drain_remote_pages(void) +/* + * Called from the slab reaper to drain pagesets on a particular node that + * belong to the currently executing processor. + */ +void drain_node_pages(int nodeid) { - struct zone *zone; - int i; + int i, z; unsigned long flags; local_irq_save(flags); - for_each_zone(zone) { + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = NODE_DATA(nodeid)->node_zones + z; struct per_cpu_pageset *pset; - /* Do not drain local pagesets */ - if (zone->zone_pgdat->node_id == numa_node_id()) - continue; - pset = zone_pcp(zone, smp_processor_id()); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; diff --git a/mm/rmap.c b/mm/rmap.c index df2c41c2a9a..67f0e20b101 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -212,25 +212,33 @@ out: * through real pte's pointing to valid pages and then releasing * the page from the swap cache. * - * Must hold page lock on page. + * Must hold page lock on page and mmap_sem of one vma that contains + * the page. */ void remove_from_swap(struct page *page) { struct anon_vma *anon_vma; struct vm_area_struct *vma; + unsigned long mapping; - if (!PageAnon(page) || !PageSwapCache(page)) + if (!PageSwapCache(page)) return; - anon_vma = page_lock_anon_vma(page); - if (!anon_vma) + mapping = (unsigned long)page->mapping; + + if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) return; + /* + * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. + */ + anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); + spin_lock(&anon_vma->lock); + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) remove_vma_swap(vma, page); spin_unlock(&anon_vma->lock); - delete_from_swap_cache(page); } EXPORT_SYMBOL(remove_from_swap); @@ -529,9 +537,6 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - BUG_ON(PageAnon(page)); - BUG_ON(!pfn_valid(page_to_pfn(page))); - if (atomic_inc_and_test(&page->_mapcount)) __inc_page_state(nr_mapped); } diff --git a/mm/slab.c b/mm/slab.c index add05d808a4..d0bd7f07ab0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -789,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char * dump_stack(); } +#ifdef CONFIG_NUMA +/* + * Special reaping functions for NUMA systems called from cache_reap(). + * These take care of doing round robin flushing of alien caches (containing + * objects freed on different nodes from which they were allocated) and the + * flushing of remote pcps by calling drain_node_pages. + */ +static DEFINE_PER_CPU(unsigned long, reap_node); + +static void init_reap_node(int cpu) +{ + int node; + + node = next_node(cpu_to_node(cpu), node_online_map); + if (node == MAX_NUMNODES) + node = 0; + + __get_cpu_var(reap_node) = node; +} + +static void next_reap_node(void) +{ + int node = __get_cpu_var(reap_node); + + /* + * Also drain per cpu pages on remote zones + */ + if (node != numa_node_id()) + drain_node_pages(node); + + node = next_node(node, node_online_map); + if (unlikely(node >= MAX_NUMNODES)) + node = first_node(node_online_map); + __get_cpu_var(reap_node) = node; +} + +#else +#define init_reap_node(cpu) do { } while (0) +#define next_reap_node(void) do { } while (0) +#endif + /* * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz * via the workqueue/eventd. @@ -806,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu) * at that time. */ if (keventd_up() && reap_work->func == NULL) { + init_reap_node(cpu); INIT_WORK(reap_work, cache_reap, NULL); schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); } @@ -884,6 +926,23 @@ static void __drain_alien_cache(struct kmem_cache *cachep, } } +/* + * Called from cache_reap() to regularly drain alien caches round robin. + */ +static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) +{ + int node = __get_cpu_var(reap_node); + + if (l3->alien) { + struct array_cache *ac = l3->alien[node]; + if (ac && ac->avail) { + spin_lock_irq(&ac->lock); + __drain_alien_cache(cachep, ac, node); + spin_unlock_irq(&ac->lock); + } + } +} + static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) { int i = 0; @@ -902,6 +961,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al #else #define drain_alien_cache(cachep, alien) do { } while (0) +#define reap_alien(cachep, l3) do { } while (0) static inline struct array_cache **alloc_alien_cache(int node, int limit) { @@ -1124,6 +1184,7 @@ void __init kmem_cache_init(void) struct cache_sizes *sizes; struct cache_names *names; int i; + int order; for (i = 0; i < NUM_INIT_LISTS; i++) { kmem_list3_init(&initkmem_list3[i]); @@ -1167,11 +1228,15 @@ void __init kmem_cache_init(void) cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); - cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0, - &left_over, &cache_cache.num); + for (order = 0; order < MAX_ORDER; order++) { + cache_estimate(order, cache_cache.buffer_size, + cache_line_size(), 0, &left_over, &cache_cache.num); + if (cache_cache.num) + break; + } if (!cache_cache.num) BUG(); - + cache_cache.gfporder = order; cache_cache.colour = left_over / cache_cache.colour_off; cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + sizeof(struct slab), cache_line_size()); @@ -1628,36 +1693,44 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, size_t size, size_t align, unsigned long flags) { size_t left_over = 0; + int gfporder; - for (;; cachep->gfporder++) { + for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) { unsigned int num; size_t remainder; - if (cachep->gfporder > MAX_GFP_ORDER) { - cachep->num = 0; - break; - } - - cache_estimate(cachep->gfporder, size, align, flags, - &remainder, &num); + cache_estimate(gfporder, size, align, flags, &remainder, &num); if (!num) continue; + /* More than offslab_limit objects will cause problems */ - if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) + if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit) break; + /* Found something acceptable - save it away */ cachep->num = num; + cachep->gfporder = gfporder; left_over = remainder; /* + * A VFS-reclaimable slab tends to have most allocations + * as GFP_NOFS and we really don't want to have to be allocating + * higher-order pages when we are unable to shrink dcache. + */ + if (flags & SLAB_RECLAIM_ACCOUNT) + break; + + /* * Large number of objects is good, but very large slabs are * currently bad for the gfp()s. */ - if (cachep->gfporder >= slab_break_gfp_order) + if (gfporder >= slab_break_gfp_order) break; - if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder)) - /* Acceptable internal fragmentation */ + /* + * Acceptable internal fragmentation? + */ + if ((left_over * 8) <= (PAGE_SIZE << gfporder)) break; } return left_over; @@ -1869,17 +1942,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, size = ALIGN(size, align); - if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) { - /* - * A VFS-reclaimable slab tends to have most allocations - * as GFP_NOFS and we really don't want to have to be allocating - * higher-order pages when we are unable to shrink dcache. - */ - cachep->gfporder = 0; - cache_estimate(cachep->gfporder, size, align, flags, - &left_over, &cachep->num); - } else - left_over = calculate_slab_order(cachep, size, align, flags); + left_over = calculate_slab_order(cachep, size, align, flags); if (!cachep->num) { printk("kmem_cache_create: couldn't create cache %s.\n", name); @@ -2554,7 +2617,7 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", cachep->name, cachep->num, slabp, slabp->inuse); for (i = 0; - i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t); + i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); i++) { if ((i % 16) == 0) printk("\n%03x:", i); @@ -3494,8 +3557,7 @@ static void cache_reap(void *unused) check_irq_on(); l3 = searchp->nodelists[numa_node_id()]; - if (l3->alien) - drain_alien_cache(searchp, l3->alien); + reap_alien(searchp, l3); spin_lock_irq(&l3->list_lock); drain_array_locked(searchp, cpu_cache_get(searchp), 0, @@ -3545,7 +3607,7 @@ static void cache_reap(void *unused) } check_irq_on(); mutex_unlock(&cache_chain_mutex); - drain_remote_pages(); + next_reap_node(); /* Setup the next iteration */ schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); } diff --git a/mm/swap.c b/mm/swap.c index cce3dda59c5..e9ec06d845e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -489,13 +489,34 @@ void percpu_counter_mod(struct percpu_counter *fbc, long amount) if (count >= FBC_BATCH || count <= -FBC_BATCH) { spin_lock(&fbc->lock); fbc->count += count; + *pcount = 0; spin_unlock(&fbc->lock); - count = 0; + } else { + *pcount = count; } - *pcount = count; put_cpu(); } EXPORT_SYMBOL(percpu_counter_mod); + +/* + * Add up all the per-cpu counts, return the result. This is a more accurate + * but much slower version of percpu_counter_read_positive() + */ +long percpu_counter_sum(struct percpu_counter *fbc) +{ + long ret; + int cpu; + + spin_lock(&fbc->lock); + ret = fbc->count; + for_each_cpu(cpu) { + long *pcount = per_cpu_ptr(fbc->counters, cpu); + ret += *pcount; + } + spin_unlock(&fbc->lock); + return ret < 0 ? 0 : ret; +} +EXPORT_SYMBOL(percpu_counter_sum); #endif /* diff --git a/mm/vmscan.c b/mm/vmscan.c index b0af7593d01..7ccf763bb30 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1883,7 +1883,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) if (!(gfp_mask & __GFP_WAIT) || zone->all_unreclaimable || - atomic_read(&zone->reclaim_in_progress) > 0) + atomic_read(&zone->reclaim_in_progress) > 0 || + (p->flags & PF_MEMALLOC)) return 0; node_id = zone->zone_pgdat->node_id; |