diff options
author | H. Peter Anvin <hpa@zytor.com> | 2009-05-23 16:42:19 -0700 |
---|---|---|
committer | H. Peter Anvin <hpa@zytor.com> | 2009-05-23 16:42:19 -0700 |
commit | ee0736627d3347be0be2769fa7b26431f9726c9d (patch) | |
tree | 203e2204daaec4cf005463fdf2c7bf380d6eef36 /mm | |
parent | cf9972a921470b0a2da7906104bcd540b20e33bf (diff) | |
parent | 0af48f42df15b97080b450d24219dd95db7b929a (diff) |
Merge branch 'x86/urgent' into x86/setup
Resolved conflicts:
arch/x86/boot/memory.c
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 30 | ||||
-rw-r--r-- | mm/filemap.c | 5 | ||||
-rw-r--r-- | mm/memcontrol.c | 40 | ||||
-rw-r--r-- | mm/memory.c | 112 | ||||
-rw-r--r-- | mm/mmap.c | 14 | ||||
-rw-r--r-- | mm/nommu.c | 17 | ||||
-rw-r--r-- | mm/oom_kill.c | 44 | ||||
-rw-r--r-- | mm/page-writeback.c | 6 | ||||
-rw-r--r-- | mm/page_alloc.c | 20 | ||||
-rw-r--r-- | mm/pdflush.c | 31 | ||||
-rw-r--r-- | mm/shmem.c | 35 | ||||
-rw-r--r-- | mm/slob.c | 5 | ||||
-rw-r--r-- | mm/slub.c | 6 | ||||
-rw-r--r-- | mm/swap.c | 46 | ||||
-rw-r--r-- | mm/util.c | 16 | ||||
-rw-r--r-- | mm/vmalloc.c | 1 | ||||
-rw-r--r-- | mm/vmscan.c | 19 |
17 files changed, 261 insertions, 186 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index b53427ad30a..c2b57d81e15 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -213,6 +213,8 @@ config UNEVICTABLE_LRU will use one page flag and increase the code size a little, say Y unless you know what you are doing. + See Documentation/vm/unevictable-lru.txt for more information. + config HAVE_MLOCK bool default y if MMU=y @@ -223,3 +225,31 @@ config HAVE_MLOCKED_PAGE_BIT config MMU_NOTIFIER bool + +config NOMMU_INITIAL_TRIM_EXCESS + int "Turn on mmap() excess space trimming before booting" + depends on !MMU + default 1 + help + The NOMMU mmap() frequently needs to allocate large contiguous chunks + of memory on which to store mappings, but it can only ask the system + allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently + more than it requires. To deal with this, mmap() is able to trim off + the excess and return it to the allocator. + + If trimming is enabled, the excess is trimmed off and returned to the + system allocator, which can cause extra fragmentation, particularly + if there are a lot of transient processes. + + If trimming is disabled, the excess is kept, but not used, which for + long-term mappings means that the space is wasted. + + Trimming can be dynamically controlled through a sysctl option + (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of + excess pages there must be before trimming should occur, or zero if + no trimming is to occur. + + This option specifies the initial value of this option. The default + of 1 says that all excess pages should be trimmed. + + See Documentation/nommu-mmap.txt for more information. diff --git a/mm/filemap.c b/mm/filemap.c index 2e2d38ebda4..379ff0bcbf6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -441,6 +441,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, } return err; } +EXPORT_SYMBOL(filemap_write_and_wait_range); /** * add_to_page_cache_locked - add a locked page to the pagecache @@ -567,8 +568,8 @@ EXPORT_SYMBOL(wait_on_page_bit); /** * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue - * @page - Page defining the wait queue of interest - * @waiter - Waiter to add to the queue + * @page: Page defining the wait queue of interest + * @waiter: Waiter to add to the queue * * Add an arbitrary @waiter to the wait queue for the nominated @page. */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2fc6d6c4823..01c2d8f1468 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -932,7 +932,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (unlikely(!mem)) return 0; - VM_BUG_ON(mem_cgroup_is_obsolete(mem)); + VM_BUG_ON(!mem || mem_cgroup_is_obsolete(mem)); while (1) { int ret; @@ -1024,9 +1024,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) return NULL; pc = lookup_page_cgroup(page); - /* - * Used bit of swapcache is solid under page lock. - */ + lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; if (mem && !css_tryget(&mem->css)) @@ -1040,6 +1038,7 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) mem = NULL; rcu_read_unlock(); } + unlock_page_cgroup(pc); return mem; } @@ -1618,37 +1617,28 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, } /* - * A call to try to shrink memory usage under specified resource controller. - * This is typically used for page reclaiming for shmem for reducing side - * effect of page allocation from shmem, which is used by some mem_cgroup. + * A call to try to shrink memory usage on charge failure at shmem's swapin. + * Calling hierarchical_reclaim is not enough because we should update + * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. + * Moreover considering hierarchy, we should reclaim from the mem_over_limit, + * not from the memcg which this page would be charged to. + * try_charge_swapin does all of these works properly. */ -int mem_cgroup_shrink_usage(struct page *page, +int mem_cgroup_shmem_charge_fallback(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { struct mem_cgroup *mem = NULL; - int progress = 0; - int retry = MEM_CGROUP_RECLAIM_RETRIES; + int ret; if (mem_cgroup_disabled()) return 0; - if (page) - mem = try_get_mem_cgroup_from_swapcache(page); - if (!mem && mm) - mem = try_get_mem_cgroup_from_mm(mm); - if (unlikely(!mem)) - return 0; - do { - progress = mem_cgroup_hierarchical_reclaim(mem, - gfp_mask, true, false); - progress += mem_cgroup_check_under_limit(mem); - } while (!progress && --retry); + ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); + if (!ret) + mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ - css_put(&mem->css); - if (!retry) - return -ENOMEM; - return 0; + return ret; } static DEFINE_MUTEX(set_limit_mutex); diff --git a/mm/memory.c b/mm/memory.c index cf6873e91c6..4126dd16778 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1971,6 +1971,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ret = tmp; goto unwritable_page; } + if (unlikely(!(tmp & VM_FAULT_LOCKED))) { + lock_page(old_page); + if (!old_page->mapping) { + ret = 0; /* retry the fault */ + unlock_page(old_page); + goto unwritable_page; + } + } else + VM_BUG_ON(!PageLocked(old_page)); /* * Since we dropped the lock we need to revalidate @@ -1980,9 +1989,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - page_cache_release(old_page); - if (!pte_same(*page_table, orig_pte)) + if (!pte_same(*page_table, orig_pte)) { + unlock_page(old_page); + page_cache_release(old_page); goto unlock; + } page_mkwrite = 1; } @@ -2094,9 +2105,6 @@ gotten: unlock: pte_unmap_unlock(page_table, ptl); if (dirty_page) { - if (vma->vm_file) - file_update_time(vma->vm_file); - /* * Yes, Virginia, this is actually required to prevent a race * with clear_page_dirty_for_io() from clearing the page dirty @@ -2105,16 +2113,41 @@ unlock: * * do_no_page is protected similarly. */ - wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); + if (!page_mkwrite) { + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); + } put_page(dirty_page); + if (page_mkwrite) { + struct address_space *mapping = dirty_page->mapping; + + set_page_dirty(dirty_page); + unlock_page(dirty_page); + page_cache_release(dirty_page); + if (mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + } + + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); } return ret; oom_free_new: page_cache_release(new_page); oom: - if (old_page) + if (old_page) { + if (page_mkwrite) { + unlock_page(old_page); + page_cache_release(old_page); + } page_cache_release(old_page); + } return VM_FAULT_OOM; unwritable_page: @@ -2458,8 +2491,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { ret = VM_FAULT_OOM; - unlock_page(page); - goto out; + goto out_page; } /* @@ -2521,6 +2553,7 @@ out: out_nomap: mem_cgroup_cancel_charge_swapin(ptr); pte_unmap_unlock(page_table, ptl); +out_page: unlock_page(page); page_cache_release(page); return ret; @@ -2664,27 +2697,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, int tmp; unlock_page(page); - vmf.flags |= FAULT_FLAG_MKWRITE; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; tmp = vma->vm_ops->page_mkwrite(vma, &vmf); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { ret = tmp; - anon = 1; /* no anon but release vmf.page */ - goto out_unlocked; - } - lock_page(page); - /* - * XXX: this is not quite right (racy vs - * invalidate) to unlock and relock the page - * like this, however a better fix requires - * reworking page_mkwrite locking API, which - * is better done later. - */ - if (!page->mapping) { - ret = 0; - anon = 1; /* no anon but release vmf.page */ - goto out; + goto unwritable_page; } + if (unlikely(!(tmp & VM_FAULT_LOCKED))) { + lock_page(page); + if (!page->mapping) { + ret = 0; /* retry the fault */ + unlock_page(page); + goto unwritable_page; + } + } else + VM_BUG_ON(!PageLocked(page)); page_mkwrite = 1; } } @@ -2736,19 +2764,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(page_table, ptl); out: - unlock_page(vmf.page); -out_unlocked: - if (anon) - page_cache_release(vmf.page); - else if (dirty_page) { - if (vma->vm_file) - file_update_time(vma->vm_file); + if (dirty_page) { + struct address_space *mapping = page->mapping; - set_page_dirty_balance(dirty_page, page_mkwrite); + if (set_page_dirty(dirty_page)) + page_mkwrite = 1; + unlock_page(dirty_page); put_page(dirty_page); + if (page_mkwrite && mapping) { + /* + * Some device drivers do not set page.mapping but still + * dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); + } else { + unlock_page(vmf.page); + if (anon) + page_cache_release(vmf.page); } return ret; + +unwritable_page: + page_cache_release(page); + return ret; } static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/mm/mmap.c b/mm/mmap.c index 4a3841186c1..6b7b1a95944 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -85,7 +85,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; -atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); +struct percpu_counter vm_committed_as; /* * Check that a process has enough memory to allocate a new virtual @@ -179,11 +179,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (mm) allowed -= mm->total_vm / 32; - /* - * cast `allowed' as a signed long because vm_committed_space - * sometimes has a negative value - */ - if (atomic_long_read(&vm_committed_space) < (long)allowed) + if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: vm_unacct_memory(pages); @@ -1575,7 +1571,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns * Overcommit.. This must be the final test, as it will * update security statistics. */ - if (security_vm_enough_memory(grow)) + if (security_vm_enough_memory_mm(mm, grow)) return -ENOMEM; /* Ok, everything looks good - let it rip */ @@ -2481,4 +2477,8 @@ void mm_drop_all_locks(struct mm_struct *mm) */ void __init mmap_init(void) { + int ret; + + ret = percpu_counter_init(&vm_committed_as, 0); + VM_BUG_ON(ret); } diff --git a/mm/nommu.c b/mm/nommu.c index 72eda4aee2c..b571ef70742 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -62,11 +62,11 @@ void *high_memory; struct page *mem_map; unsigned long max_mapnr; unsigned long num_physpages; -atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); +struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; -int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ +int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; @@ -463,6 +463,10 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) */ void __init mmap_init(void) { + int ret; + + ret = percpu_counter_init(&vm_committed_as, 0); + VM_BUG_ON(ret); vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); } @@ -511,8 +515,6 @@ static void add_nommu_region(struct vm_region *region) validate_nommu_regions(); - BUG_ON(region->vm_start & ~PAGE_MASK); - parent = NULL; p = &nommu_region_tree.rb_node; while (*p) { @@ -1847,12 +1849,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (mm) allowed -= mm->total_vm / 32; - /* - * cast `allowed' as a signed long because vm_committed_space - * sometimes has a negative value - */ - if (atomic_long_read(&vm_committed_space) < (long)allowed) + if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; + error: vm_unacct_memory(pages); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2f3166e308d..92bcf1db16b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -514,34 +514,32 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) */ static void __out_of_memory(gfp_t gfp_mask, int order) { - if (sysctl_oom_kill_allocating_task) { - oom_kill_process(current, gfp_mask, order, 0, NULL, - "Out of memory (oom_kill_allocating_task)"); - - } else { - unsigned long points; - struct task_struct *p; - -retry: - /* - * Rambo mode: Shoot down a process and hope it solves whatever - * issues we may have. - */ - p = select_bad_process(&points, NULL); + struct task_struct *p; + unsigned long points; - if (PTR_ERR(p) == -1UL) + if (sysctl_oom_kill_allocating_task) + if (!oom_kill_process(current, gfp_mask, order, 0, NULL, + "Out of memory (oom_kill_allocating_task)")) return; +retry: + /* + * Rambo mode: Shoot down a process and hope it solves whatever + * issues we may have. + */ + p = select_bad_process(&points, NULL); - /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p) { - read_unlock(&tasklist_lock); - panic("Out of memory and no killable processes...\n"); - } + if (PTR_ERR(p) == -1UL) + return; - if (oom_kill_process(p, gfp_mask, order, points, NULL, - "Out of memory")) - goto retry; + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (!p) { + read_unlock(&tasklist_lock); + panic("Out of memory and no killable processes...\n"); } + + if (oom_kill_process(p, gfp_mask, order, points, NULL, + "Out of memory")) + goto retry; } /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 30351f0063a..bb553c3e955 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -94,12 +94,12 @@ unsigned long vm_dirty_bytes; /* * The interval between `kupdate'-style writebacks */ -unsigned int dirty_writeback_interval = 5 * 100; /* sentiseconds */ +unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ /* * The longest time for which data is allowed to remain dirty */ -unsigned int dirty_expire_interval = 30 * 100; /* sentiseconds */ +unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ /* * Flag that makes the machine dump writes/reads and block dirtyings. @@ -770,7 +770,7 @@ static void wb_kupdate(unsigned long arg) sync_supers(); - oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval); + oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); start_jif = jiffies; next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); nr_to_write = global_page_state(NR_FILE_DIRTY) + diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e2f26991fff..fe753ecf2aa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2681,6 +2681,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) static int zone_batchsize(struct zone *zone) { +#ifdef CONFIG_MMU int batch; /* @@ -2706,9 +2707,26 @@ static int zone_batchsize(struct zone *zone) * of pages of one half of the possible page colors * and the other with pages of the other colors. */ - batch = (1 << (fls(batch + batch/2)-1)) - 1; + batch = rounddown_pow_of_two(batch + batch/2) - 1; return batch; + +#else + /* The deferral and batching of frees should be suppressed under NOMMU + * conditions. + * + * The problem is that NOMMU needs to be able to allocate large chunks + * of contiguous memory as there's no hardware page translation to + * assemble apparent contiguous memory from discontiguous pages. + * + * Queueing large contiguous runs of pages for batching, however, + * causes the pages to actually be freed in smaller chunks. As there + * can be a significant delay between the individual batches being + * recycled, this leads to the once large chunks of space being + * fragmented and becoming unavailable for high-order allocations. + */ + return 0; +#endif } static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) diff --git a/mm/pdflush.c b/mm/pdflush.c index f2caf96993f..235ac440c44 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -58,14 +58,6 @@ static DEFINE_SPINLOCK(pdflush_lock); int nr_pdflush_threads = 0; /* - * The max/min number of pdflush threads. R/W by sysctl at - * /proc/sys/vm/nr_pdflush_threads_max/min - */ -int nr_pdflush_threads_max __read_mostly = MAX_PDFLUSH_THREADS; -int nr_pdflush_threads_min __read_mostly = MIN_PDFLUSH_THREADS; - - -/* * The time at which the pdflush thread pool last went empty */ static unsigned long last_empty_jifs; @@ -76,7 +68,7 @@ static unsigned long last_empty_jifs; * Thread pool management algorithm: * * - The minimum and maximum number of pdflush instances are bound - * by nr_pdflush_threads_min and nr_pdflush_threads_max. + * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. * * - If there have been no idle pdflush instances for 1 second, create * a new one. @@ -142,13 +134,14 @@ static int __pdflush(struct pdflush_work *my_work) * To throttle creation, we reset last_empty_jifs. */ if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { - if (list_empty(&pdflush_list) && - nr_pdflush_threads < nr_pdflush_threads_max) { - last_empty_jifs = jiffies; - nr_pdflush_threads++; - spin_unlock_irq(&pdflush_lock); - start_one_pdflush_thread(); - spin_lock_irq(&pdflush_lock); + if (list_empty(&pdflush_list)) { + if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) { + last_empty_jifs = jiffies; + nr_pdflush_threads++; + spin_unlock_irq(&pdflush_lock); + start_one_pdflush_thread(); + spin_lock_irq(&pdflush_lock); + } } } @@ -160,7 +153,7 @@ static int __pdflush(struct pdflush_work *my_work) */ if (list_empty(&pdflush_list)) continue; - if (nr_pdflush_threads <= nr_pdflush_threads_min) + if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) continue; pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { @@ -266,9 +259,9 @@ static int __init pdflush_init(void) * Pre-set nr_pdflush_threads... If we fail to create, * the count will be decremented. */ - nr_pdflush_threads = nr_pdflush_threads_min; + nr_pdflush_threads = MIN_PDFLUSH_THREADS; - for (i = 0; i < nr_pdflush_threads_min; i++) + for (i = 0; i < MIN_PDFLUSH_THREADS; i++) start_one_pdflush_thread(); return 0; } diff --git a/mm/shmem.c b/mm/shmem.c index d94d2e9146b..b25f95ce3db 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -24,6 +24,7 @@ #include <linux/init.h> #include <linux/vfs.h> #include <linux/mount.h> +#include <linux/pagemap.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/module.h> @@ -43,7 +44,6 @@ static struct vfsmount *shm_mnt; #include <linux/exportfs.h> #include <linux/generic_acl.h> #include <linux/mman.h> -#include <linux/pagemap.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/backing-dev.h> @@ -65,13 +65,28 @@ static struct vfsmount *shm_mnt; #include <asm/div64.h> #include <asm/pgtable.h> +/* + * The maximum size of a shmem/tmpfs file is limited by the maximum size of + * its triple-indirect swap vector - see illustration at shmem_swp_entry(). + * + * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel, + * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum + * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel, + * MAX_LFS_FILESIZE being then more restrictive than swap vector layout. + * + * We use / and * instead of shifts in the definitions below, so that the swap + * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE. + */ #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) -#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) -#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) +#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) -#define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) -#define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT) +#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) +#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT) +#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE) +#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT)) + +#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) /* info->flags needs VM_flags to handle pagein/truncate races efficiently */ @@ -1325,8 +1340,12 @@ repeat: shmem_swp_unmap(entry); spin_unlock(&info->lock); if (error == -ENOMEM) { - /* allow reclaim from this memory cgroup */ - error = mem_cgroup_shrink_usage(swappage, + /* + * reclaim from proper memory cgroup and + * call memcg's OOM if needed. + */ + error = mem_cgroup_shmem_charge_fallback( + swappage, current->mm, gfp); if (error) { @@ -2581,7 +2600,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) #define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) -#define SHMEM_MAX_BYTES LLONG_MAX +#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE #endif /* CONFIG_SHMEM */ diff --git a/mm/slob.c b/mm/slob.c index a2d4ab32198..f92e66d558b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -60,6 +60,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/swap.h> /* struct reclaim_state */ #include <linux/cache.h> #include <linux/init.h> #include <linux/module.h> @@ -255,6 +256,8 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) static void slob_free_pages(void *b, int order) { + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += 1 << order; free_pages((unsigned long)b, order); } @@ -407,7 +410,7 @@ static void slob_free(void *block, int size) spin_unlock_irqrestore(&slob_lock, flags); clear_slob_page(sp); free_slob_page(sp); - free_page((unsigned long)b); + slob_free_pages(b, 0); return; } diff --git a/mm/slub.c b/mm/slub.c index 7ab54ecbd3f..65ffda5934b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -9,6 +9,7 @@ */ #include <linux/mm.h> +#include <linux/swap.h> /* struct reclaim_state */ #include <linux/module.h> #include <linux/bit_spinlock.h> #include <linux/interrupt.h> @@ -1170,6 +1171,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlab(page); reset_page_mapcount(page); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += pages; __free_pages(page, order); } @@ -1909,7 +1912,7 @@ static inline int calculate_order(int size) * Doh this slab cannot be placed using slub_max_order. */ order = slab_order(size, 1, MAX_ORDER, 1); - if (order <= MAX_ORDER) + if (order < MAX_ORDER) return order; return -ENOSYS; } @@ -2522,6 +2525,7 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { get_option(&str, &slub_max_order); + slub_max_order = min(slub_max_order, MAX_ORDER - 1); return 1; } diff --git a/mm/swap.c b/mm/swap.c index bede23ce64e..cb29ae5d33a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -491,49 +491,6 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, EXPORT_SYMBOL(pagevec_lookup_tag); -#ifdef CONFIG_SMP -/* - * We tolerate a little inaccuracy to avoid ping-ponging the counter between - * CPUs - */ -#define ACCT_THRESHOLD max(16, NR_CPUS * 2) - -static DEFINE_PER_CPU(long, committed_space); - -void vm_acct_memory(long pages) -{ - long *local; - - preempt_disable(); - local = &__get_cpu_var(committed_space); - *local += pages; - if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) { - atomic_long_add(*local, &vm_committed_space); - *local = 0; - } - preempt_enable(); -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* Drop the CPU's cached committed space back into the central pool. */ -static int cpu_swap_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - long *committed; - - committed = &per_cpu(committed_space, (long)hcpu); - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - atomic_long_add(*committed, &vm_committed_space); - *committed = 0; - drain_cpu_pagevecs((long)hcpu); - } - return NOTIFY_OK; -} -#endif /* CONFIG_HOTPLUG_CPU */ -#endif /* CONFIG_SMP */ - /* * Perform any setup for the swap system */ @@ -554,7 +511,4 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ -#ifdef CONFIG_HOTPLUG_CPU - hotcpu_notifier(cpu_swap_callback, 0); -#endif } diff --git a/mm/util.c b/mm/util.c index 2599e83eea1..55bef160b9f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -223,6 +223,22 @@ void arch_pick_mmap_layout(struct mm_struct *mm) } #endif +/** + * get_user_pages_fast() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Attempt to pin user pages in memory without taking mm->mmap_sem. + * If not successful, it will fall back to taking the lock and + * calling get_user_pages(). + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. + */ int __attribute__((weak)) get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fab19876b4d..083716ea38c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -402,6 +402,7 @@ overflow: printk(KERN_WARNING "vmap allocation for size %lu failed: " "use vmalloc=<size> to increase size.\n", size); + kfree(va); return ERR_PTR(-EBUSY); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 39fdfb14eea..5fa3eda1f03 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -63,6 +63,9 @@ struct scan_control { /* Can mapped pages be reclaimed? */ int may_unmap; + /* Can pages be swapped as part of reclaim? */ + int may_swap; + /* This context's SWAP_CLUSTER_MAX. If freeing memory for * suspend, we effectively ignore SWAP_CLUSTER_MAX. * In this context, it doesn't matter that we scan the @@ -1380,7 +1383,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); /* If we have no swap space, do not bother scanning anon pages. */ - if (nr_swap_pages <= 0) { + if (!sc->may_swap || (nr_swap_pages <= 0)) { percent[0] = 0; percent[1] = 100; return; @@ -1468,7 +1471,7 @@ static void shrink_zone(int priority, struct zone *zone, for_each_evictable_lru(l) { int file = is_file_lru(l); - int scan; + unsigned long scan; scan = zone_nr_pages(zone, sc, l); if (priority) { @@ -1697,6 +1700,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_writepage = !laptop_mode, .swap_cluster_max = SWAP_CLUSTER_MAX, .may_unmap = 1, + .may_swap = 1, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, @@ -1717,6 +1721,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, + .may_swap = !noswap, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, @@ -1726,9 +1731,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, }; struct zonelist *zonelist; - if (noswap) - sc.may_unmap = 0; - sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); zonelist = NODE_DATA(numa_node_id())->node_zonelists; @@ -1767,6 +1769,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_unmap = 1, + .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, .order = order, @@ -2088,13 +2091,13 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); if (nr_reclaimed >= nr_pages) { - sc->nr_reclaimed = nr_reclaimed; + sc->nr_reclaimed += nr_reclaimed; return; } } } } - sc->nr_reclaimed = nr_reclaimed; + sc->nr_reclaimed += nr_reclaimed; } /* @@ -2115,6 +2118,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) .may_unmap = 0, .may_writepage = 1, .isolate_pages = isolate_pages_global, + .nr_reclaimed = 0, }; current->reclaim_state = &reclaim_state; @@ -2297,6 +2301,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_swap = 1, .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, |