From a56ed663047fc9927ec8b35750d23ece54f85dc7 Mon Sep 17 00:00:00 2001 From: Peter W Morreale Date: Mon, 6 Apr 2009 19:00:28 -0700 Subject: mm: fix pdflush thread creation upper bound Fix a race on creating pdflush threads. Without the patch, it is possible to create more than MAX_PDFLUSH_THREADS threads, and this has been observed in practice on IO loaded SMP machines. The fix involves moving the lock around to protect the check against the thread count and correctly dealing with thread creation failure. This fix also _mostly_ repairs a race condition on how quickly the threads are created. The original intent was to create a pdflush thread (up to the max allowed) every second. Without this patch is is possible to create NCPUS pdflush threads concurrently. The 'mostly' caveat is because an assumption is made that thread creation will be successful. If we fail to create the thread, the miss is not considered fatal. (we will try again in 1 second) Signed-off-by: Peter W Morreale Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pdflush.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/pdflush.c b/mm/pdflush.c index 118905e3d78..235ac440c44 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -98,7 +98,6 @@ static int __pdflush(struct pdflush_work *my_work) INIT_LIST_HEAD(&my_work->list); spin_lock_irq(&pdflush_lock); - nr_pdflush_threads++; for ( ; ; ) { struct pdflush_work *pdf; @@ -126,20 +125,26 @@ static int __pdflush(struct pdflush_work *my_work) (*my_work->fn)(my_work->arg0); + spin_lock_irq(&pdflush_lock); + /* * Thread creation: For how long have there been zero * available threads? + * + * To throttle creation, we reset last_empty_jifs. */ if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { - /* unlocked list_empty() test is OK here */ if (list_empty(&pdflush_list)) { - /* unlocked test is OK here */ - if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) + if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) { + last_empty_jifs = jiffies; + nr_pdflush_threads++; + spin_unlock_irq(&pdflush_lock); start_one_pdflush_thread(); + spin_lock_irq(&pdflush_lock); + } } } - spin_lock_irq(&pdflush_lock); my_work->fn = NULL; /* @@ -236,13 +241,26 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) static void start_one_pdflush_thread(void) { - kthread_run(pdflush, NULL, "pdflush"); + struct task_struct *k; + + k = kthread_run(pdflush, NULL, "pdflush"); + if (unlikely(IS_ERR(k))) { + spin_lock_irq(&pdflush_lock); + nr_pdflush_threads--; + spin_unlock_irq(&pdflush_lock); + } } static int __init pdflush_init(void) { int i; + /* + * Pre-set nr_pdflush_threads... If we fail to create, + * the count will be decremented. + */ + nr_pdflush_threads = MIN_PDFLUSH_THREADS; + for (i = 0; i < MIN_PDFLUSH_THREADS; i++) start_one_pdflush_thread(); return 0; -- cgit v1.2.3 From fafd688e4c0c34da0f3de909881117d374e4c7af Mon Sep 17 00:00:00 2001 From: Peter W Morreale Date: Mon, 6 Apr 2009 19:00:29 -0700 Subject: mm: add /proc controls for pdflush threads Add /proc entries to give the admin the ability to control the minimum and maximum number of pdflush threads. This allows finer control of pdflush on both large and small machines. The rationale is simply one size does not fit all. Admins on large and/or small systems may want to tune the min/max pdflush thread count to best suit their needs. Right now the min/max is hardcoded to 2/8. While probably a fair estimate for smaller machines, large machines with large numbers of CPUs and large numbers of filesystems/block devices may benefit from larger numbers of threads working on different block devices. Even if the background flushing algorithm is radically changed, it is still likely that multiple threads will be involved and admins would still desire finer control on the min/max other than to have to recompile the kernel. The patch adds '/proc/sys/vm/nr_pdflush_threads_min' and '/proc/sys/vm/nr_pdflush_threads_max' with r/w permissions. The minimum value for nr_pdflush_threads_min is 1 and the maximum value is the current value of nr_pdflush_threads_max. This minimum is required since additional thread creation is performed in a pdflush thread itself. The minimum value for nr_pdflush_threads_max is the current value of nr_pdflush_threads_min and the maximum value can be 1000. Documentation/sysctl/vm.txt is also updated. [akpm@linux-foundation.org: fix comment, fix whitespace, use __read_mostly] Signed-off-by: Peter W Morreale Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pdflush.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/pdflush.c b/mm/pdflush.c index 235ac440c44..f2caf96993f 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -57,6 +57,14 @@ static DEFINE_SPINLOCK(pdflush_lock); */ int nr_pdflush_threads = 0; +/* + * The max/min number of pdflush threads. R/W by sysctl at + * /proc/sys/vm/nr_pdflush_threads_max/min + */ +int nr_pdflush_threads_max __read_mostly = MAX_PDFLUSH_THREADS; +int nr_pdflush_threads_min __read_mostly = MIN_PDFLUSH_THREADS; + + /* * The time at which the pdflush thread pool last went empty */ @@ -68,7 +76,7 @@ static unsigned long last_empty_jifs; * Thread pool management algorithm: * * - The minimum and maximum number of pdflush instances are bound - * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. + * by nr_pdflush_threads_min and nr_pdflush_threads_max. * * - If there have been no idle pdflush instances for 1 second, create * a new one. @@ -134,14 +142,13 @@ static int __pdflush(struct pdflush_work *my_work) * To throttle creation, we reset last_empty_jifs. */ if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { - if (list_empty(&pdflush_list)) { - if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) { - last_empty_jifs = jiffies; - nr_pdflush_threads++; - spin_unlock_irq(&pdflush_lock); - start_one_pdflush_thread(); - spin_lock_irq(&pdflush_lock); - } + if (list_empty(&pdflush_list) && + nr_pdflush_threads < nr_pdflush_threads_max) { + last_empty_jifs = jiffies; + nr_pdflush_threads++; + spin_unlock_irq(&pdflush_lock); + start_one_pdflush_thread(); + spin_lock_irq(&pdflush_lock); } } @@ -153,7 +160,7 @@ static int __pdflush(struct pdflush_work *my_work) */ if (list_empty(&pdflush_list)) continue; - if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) + if (nr_pdflush_threads <= nr_pdflush_threads_min) continue; pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { @@ -259,9 +266,9 @@ static int __init pdflush_init(void) * Pre-set nr_pdflush_threads... If we fail to create, * the count will be decremented. */ - nr_pdflush_threads = MIN_PDFLUSH_THREADS; + nr_pdflush_threads = nr_pdflush_threads_min; - for (i = 0; i < MIN_PDFLUSH_THREADS; i++) + for (i = 0; i < nr_pdflush_threads_min; i++) start_one_pdflush_thread(); return 0; } -- cgit v1.2.3 From 697f619fc87aa9bf5b6c8c756f7ea54e950d5cd5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 13 Apr 2009 14:39:54 -0700 Subject: filemap: fix kernel-doc warnings Fix filemap.c kernel-doc warnings: Warning(mm/filemap.c:575): No description found for parameter 'page' Warning(mm/filemap.c:575): No description found for parameter 'waiter' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 2e2d38ebda4..8bd498040f3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -567,8 +567,8 @@ EXPORT_SYMBOL(wait_on_page_bit); /** * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue - * @page - Page defining the wait queue of interest - * @waiter - Waiter to add to the queue + * @page: Page defining the wait queue of interest + * @waiter: Waiter to add to the queue * * Add an arbitrary @waiter to the wait queue for the nominated @page. */ -- cgit v1.2.3 From 5a52edded382c2f436721d5a044ed16c290c5750 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 13 Apr 2009 14:40:01 -0700 Subject: mm: point the UNEVICTABLE_LRU config option at the documentation Point the UNEVICTABLE_LRU config option at the documentation describing the option. Signed-off-by: David Howells Cc: Lee Schermerhorn Cc: Rik van Riel Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index b53427ad30a..57971d2ab84 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -213,6 +213,8 @@ config UNEVICTABLE_LRU will use one page flag and increase the code size a little, say Y unless you know what you are doing. + See Documentation/vm/unevictable-lru.txt for more information. + config HAVE_MLOCK bool default y if MMU=y -- cgit v1.2.3 From 9de100d001564f58c3fb2ec1bd03e540ac0aa357 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 13 Apr 2009 14:40:05 -0700 Subject: mm: document get_user_pages_fast() While better than get_user_pages(), the usage of gupf(), especially the return values and the fact that it can potentially only partially pin the range, warranted some documentation. Signed-off-by: Andy Grover Cc: Ingo Molnar Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 2599e83eea1..55bef160b9f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -223,6 +223,22 @@ void arch_pick_mmap_layout(struct mm_struct *mm) } #endif +/** + * get_user_pages_fast() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Attempt to pin user pages in memory without taking mm->mmap_sem. + * If not successful, it will fall back to taking the lock and + * calling get_user_pages(). + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. + */ int __attribute__((weak)) get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { -- cgit v1.2.3 From a8031cb00e286600ea08bd00a6812dbfec412376 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Mon, 13 Apr 2009 14:40:08 -0700 Subject: memcg: remove warning when CONFIG_DEBUG_VM=n mm/memcontrol.c:318: warning: `mem_cgroup_is_obsolete' defined but not used [akpm@linux-foundation.org: simplify as suggested by Balbir] Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2fc6d6c4823..e44fb0fbb80 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -932,7 +932,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (unlikely(!mem)) return 0; - VM_BUG_ON(mem_cgroup_is_obsolete(mem)); + VM_BUG_ON(!mem || mem_cgroup_is_obsolete(mem)); while (1) { int ret; -- cgit v1.2.3 From 61609d01cbb3ab865c8cccaf85e6837c47096480 Mon Sep 17 00:00:00 2001 From: Yuri Tikhonov Date: Mon, 13 Apr 2009 14:40:11 -0700 Subject: shmem: fix division by zero Fix a division by zero which we have in shmem_truncate_range() and shmem_unuse_inode() when using big PAGE_SIZE values (e.g. 256kB on ppc44x). With 256kB PAGE_SIZE, the ENTRIES_PER_PAGEPAGE constant becomes too large (0x1.0000.0000) on a 32-bit kernel, so this patch just changes its type from 'unsigned long' to 'unsigned long long'. Hugh: reverted its unsigned long longs in shmem_truncate_range() and shmem_getpage(): the pagecache index cannot be more than an unsigned long, so the divisions by zero occurred in unreached code. It's a pity we need any ULL arithmetic here, but I found no pretty way to avoid it. Signed-off-by: Yuri Tikhonov Signed-off-by: Hugh Dickins Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index d94d2e9146b..28024ce8b8f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -66,7 +66,7 @@ static struct vfsmount *shm_mnt; #include #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) -#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) +#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) #define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) -- cgit v1.2.3 From caefba1740d8016e6dfe8fda84f85bdcb8f8c85d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 13 Apr 2009 14:40:12 -0700 Subject: shmem: respect MAX_LFS_FILESIZE SHMEM_MAX_BYTES was derived from the maximum size of its triple-indirect swap vector, forgetting to take the MAX_LFS_FILESIZE limit into account. Never mind 256kB pages, even 8kB pages on 32-bit kernels allowed files to grow slightly bigger than that supposed maximum. Fix this by using the min of both (at build time not run time). And it happens that this calculation is good as far as 8MB pages on 32-bit or 16MB pages on 64-bit: though SHMSWP_MAX_INDEX gets truncated before that, it's truncated to such large numbers that we don't need to care. [akpm@linux-foundation.org: it needs pagemap.h] [akpm@linux-foundation.org: fix sparc64 min() warnings] Signed-off-by: Hugh Dickins Cc: Yuri Tikhonov Cc: Paul Mackerras Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 28024ce8b8f..f9cb20ebb99 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -43,7 +44,6 @@ static struct vfsmount *shm_mnt; #include #include #include -#include #include #include #include @@ -65,13 +65,28 @@ static struct vfsmount *shm_mnt; #include #include +/* + * The maximum size of a shmem/tmpfs file is limited by the maximum size of + * its triple-indirect swap vector - see illustration at shmem_swp_entry(). + * + * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel, + * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum + * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel, + * MAX_LFS_FILESIZE being then more restrictive than swap vector layout. + * + * We use / and * instead of shifts in the definitions below, so that the swap + * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE. + */ #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) #define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) -#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) -#define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) -#define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT) +#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) +#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT) +#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE) +#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT)) + +#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) /* info->flags needs VM_flags to handle pagein/truncate races efficiently */ @@ -2581,7 +2596,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) #define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) -#define SHMEM_MAX_BYTES LLONG_MAX +#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE #endif /* CONFIG_SHMEM */ -- cgit v1.2.3 From f69955855eac55a048d26a1618f50dfaa160a006 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 15 Apr 2009 13:22:37 -0400 Subject: Export filemap_write_and_wait_range This wasn't exported before and is useful (used by the experimental ext3 data=guarded code) Signed-off-by: Chris Mason Acked-by: Theodore Tso Acked-by: Jan Kara Signed-off-by: Linus Torvalds --- mm/filemap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 8bd498040f3..379ff0bcbf6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -441,6 +441,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, } return err; } +EXPORT_SYMBOL(filemap_write_and_wait_range); /** * add_to_page_cache_locked - add a locked page to the pagecache -- cgit v1.2.3 From 05fa199d45c54a9bda7aa3ae6537253d6f097aa9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 16 Apr 2009 21:58:12 +0100 Subject: mm: pass correct mm when growing stack Tetsuo Handa reports seeing the WARN_ON(current->mm == NULL) in security_vm_enough_memory(), when do_execve() is touching the target mm's stack, to set up its args and environment. Yes, a UMH_NO_WAIT or UMH_WAIT_PROC call_usermodehelper() spawns an mm-less kernel thread to do the exec. And in any case, that vm_enough_memory check when growing stack ought to be done on the target mm, not on the execer's mm (though apart from the warning, it only makes a slight tweak to OVERCOMMIT_NEVER behaviour). Reported-by: Tetsuo Handa Signed-off-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 4a3841186c1..3303d1ba8e8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1575,7 +1575,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns * Overcommit.. This must be the final test, as it will * update security statistics. */ - if (security_vm_enough_memory(grow)) + if (security_vm_enough_memory_mm(mm, grow)) return -ENOMEM; /* Ok, everything looks good - let it rip */ -- cgit v1.2.3 From a21e25536169432cf9174d631972bc1cd4c75062 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 18 Apr 2009 17:23:41 +0200 Subject: PM/Hibernate: Fix memory shrinking Commit d979677c4c0 ("mm: shrink_all_memory(): use sc.nr_reclaimed") broke the memory shrinking used by hibernation, becuse it did not update shrink_all_zones() in accordance with the other changes it made. Fix this by making shrink_all_zones() update sc->nr_reclaimed instead of overwriting its value. This fixes http://bugzilla.kernel.org/show_bug.cgi?id=13058 Reported-and-tested-by: Alan Jenkins Signed-off-by: Rafael J. Wysocki Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 39fdfb14eea..99155b7b812 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2088,13 +2088,13 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); if (nr_reclaimed >= nr_pages) { - sc->nr_reclaimed = nr_reclaimed; + sc->nr_reclaimed += nr_reclaimed; return; } } } } - sc->nr_reclaimed = nr_reclaimed; + sc->nr_reclaimed += nr_reclaimed; } /* @@ -2115,6 +2115,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) .may_unmap = 0, .may_writepage = 1, .isolate_pages = isolate_pages_global, + .nr_reclaimed = 0, }; current->reclaim_state = &reclaim_state; -- cgit v1.2.3 From 2e2e425989080cc534fc0fca154cae515f971cf5 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 21 Apr 2009 12:24:57 -0700 Subject: vmscan,memcg: reintroduce sc->may_swap Commit a6dc60f8975ad96d162915e07703a4439c80dcf0 ("vmscan: rename sc.may_swap to may_unmap") removed the may_swap flag, but memcg had used it as a flag for "we need to use swap?", as the name indicate. And in the current implementation, memcg cannot reclaim mapped file caches when mem+swap hits the limit. re-introduce may_swap flag and handle it at get_scan_ratio(). This patch doesn't influence any scan_control users other than memcg. Signed-off-by: KOSAKI Motohiro Signed-off-by: Daisuke Nishimura Acked-by: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 99155b7b812..eac9577941f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -63,6 +63,9 @@ struct scan_control { /* Can mapped pages be reclaimed? */ int may_unmap; + /* Can pages be swapped as part of reclaim? */ + int may_swap; + /* This context's SWAP_CLUSTER_MAX. If freeing memory for * suspend, we effectively ignore SWAP_CLUSTER_MAX. * In this context, it doesn't matter that we scan the @@ -1380,7 +1383,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); /* If we have no swap space, do not bother scanning anon pages. */ - if (nr_swap_pages <= 0) { + if (!sc->may_swap || (nr_swap_pages <= 0)) { percent[0] = 0; percent[1] = 100; return; @@ -1697,6 +1700,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_writepage = !laptop_mode, .swap_cluster_max = SWAP_CLUSTER_MAX, .may_unmap = 1, + .may_swap = 1, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, @@ -1717,6 +1721,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, + .may_swap = !noswap, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, @@ -1726,9 +1731,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, }; struct zonelist *zonelist; - if (noswap) - sc.may_unmap = 0; - sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); zonelist = NODE_DATA(numa_node_id())->node_zonelists; @@ -1767,6 +1769,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_unmap = 1, + .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, .order = order, @@ -2298,6 +2301,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_swap = 1, .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, -- cgit v1.2.3